import os
import re
from datetime import datetime
import requests
from bs4 import BeautifulSoup
[docs]
class CBEOpenDataPortal:
"""
A class to interact with the CBE (Crossroads Bank for Enterprises) Open Data portal.
This class provides methods to authenticate with the CBE Open Data portal, retrieve available data extracts,
and download specific or the latest extract files. The data extracts are typically provided as ZIP files
containing CSV files, which can be used for further processing and analysis.
Attributes:
BASE_URL (str): The base URL for the CBE Open Data portal.
LOGIN_ENDPOINT (str): The endpoint for authenticating with the CBE Open Data portal.
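EXTRACT_PAGE (str): The URL of the page listing available extract files.
BASE_EXTRACT_URL (str): The URL prefix for extract files; a file is found at BASE_EXTRACT_URL + "/files/" + <file_name>.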
EXTRACT_FILE_NAME (str): A regex pattern to parse extract file names and extract metadata.
Methods:
__init__: Initializes the CBEOpenDataPortal instance and logs in to the CBE Open Data portal.
login_to_cbe: Logs into the CBE Open Data portal and returns an authenticated session.
list_available_extracts: Retrieves a list of available extract files from the CBE Open Data portal.
get_extract_url: Retrieves the file URL of a specific or the latest CBE extract file.
download_zip: Downloads a CBE extract file from the portal and saves it locally.
Example:
>>> portal = CBEOpenDataPortal(username="your_username", password="your_password")
>>> extracts = portal.list_available_extracts()
>>> for extract in extracts:
... print(f"Extract: {extract}")
>>> extract_url = portal.get_extract_url(extract_number="0133")
>>> print(f"Extract Url: {extract_url}")
>>> portal.download_zip(extract_number="0133")
Notes:
- The CBE Open Data portal requires authentication to access extract files.
- The class uses the `requests` library for HTTP requests and `BeautifulSoup` for parsing HTML.
- Extracts are identified by their extract number, year, month, and type (e.g., "Full" or "Update").
"""
# Root of the CBE Open Data portal; every other URL below is derived from it.
BASE_URL = "https://kbopub.economie.fgov.be/kbo-open-data"
# Form endpoint that receives the login credentials.
LOGIN_ENDPOINT = f"{BASE_URL}/static/j_spring_security_check"
# HTML page with the extract files listed
EXTRACT_PAGE = f"{BASE_URL}/affiliation/xml/?files"
# Files will be found at BASE_EXTRACT_URL + "/files/" + <file_name>
BASE_EXTRACT_URL = f"{BASE_URL}/affiliation/xml"
# Regex pattern to parse extract file names and extract metadata
# Example file name: KboOpenData_0133_2025_03_Full.zip
# The dot of the ".zip" suffix is escaped so that it only matches a literal "."
# (an unescaped "." would accept any character, e.g. "..._Full_zip").
EXTRACT_FILE_NAME = r".*(?P<extract_number>\d{4})_(?P<year>\d{4})_(?P<month_num>\d{2})_.*(?P<extract_type>Full|Update)\.zip"
[docs]
def __init__(self, username: str, password: str) -> None:
    """
    Creates the portal client by logging in with the given credentials.

    Args:
        username (str): The username passed to `login_to_cbe`.
        password (str): The password passed to `login_to_cbe`.
    """
    # The session returned by the login call is kept on the instance so that
    # later requests (e.g. downloads) can reuse it.
    authenticated_session = self.login_to_cbe(username, password)
    self.session = authenticated_session
[docs]
@staticmethod
def login_to_cbe(username: str, password: str) -> requests.Session:
    """
    Logs into the CBE Open Data portal and returns an authenticated session.

    The credentials are posted as form fields to `LOGIN_ENDPOINT`. The login is
    considered successful when the response has status 200 and the username
    appears in the response body. If anything goes wrong, the session is closed
    before the error is propagated so that no connection is left open.

    Args:
        username (str): The username for logging in.
        password (str): The password for logging in.

    Returns:
        requests.Session: An authenticated session object for subsequent requests.

    Raises:
        ValueError: If the response status is not 200 or the response body does
            not contain the username.
    """
    session = requests.Session()
    try:
        payload = {"j_username": username, "j_password": password}
        login_response = session.post(CBEOpenDataPortal.LOGIN_ENDPOINT, data=payload)
        if login_response.status_code != 200:
            raise ValueError(f"Login failed. response status {login_response.status_code}.")
        # Success heuristic: the page returned after login is expected to echo the username.
        if username not in login_response.text:
            raise ValueError("Login failed. recieved unexpected response.")
    except BaseException:
        # Release the session's connections on any failure (including errors
        # raised by the request itself), then let the error propagate.
        session.close()
        raise
    print("Login successful!")
    return session
[docs]
def download_zip(
    self, extract_number: str = None, extract_type: str = "Full", download_dir: str = "data", force_download: bool = False
) -> str:
    """
    Downloads a CBE Open Data ZIP file from the official CBE portal.

    The extract URL is resolved through `get_extract_url` and the file is saved in
    `download_dir` under its original name (e.g., `KboOpenData_0133_2025_03_Full.zip`).
    The body is first streamed to a temporary `<file name>.part` file and only moved to
    its final name once the download completed, so an interrupted download never leaves
    a truncated ZIP behind (and never destroys a previously downloaded file).

    Args:
        extract_number (str, optional): The extract number to download. Defaults to None,
            which is passed on to `get_extract_url` (documented as selecting the latest extract).
        extract_type (str, optional): The extract type, either 'Full' or 'Update'. Defaults to 'Full'.
        download_dir (str, optional): Existing directory to save the downloaded ZIP file in.
            It is not created automatically. Defaults to "data".
        force_download (bool, optional): Whether to overwrite an existing file. Defaults to False.

    Returns:
        str: The path of the downloaded (or already present) ZIP file.

    Raises:
        ValueError: If `download_dir` is not an existing directory or if the server does not
            answer with status 200. Errors raised by `get_extract_url` propagate unchanged.
    """
    # The directory is never created here; a missing path (or a path that is
    # not a directory) is reported to the caller.
    if not os.path.isdir(download_dir):
        raise ValueError(f"Download directory '{download_dir}' does not exist.")

    extract_url = self.get_extract_url(extract_number, extract_type)
    # The local file keeps the last path segment of the URL as its name.
    file_name = extract_url.rsplit("/", 1)[-1]
    file_path = os.path.join(download_dir, file_name)

    if os.path.exists(file_path) and not force_download:
        print(f"File already exists: {file_path}. Use 'force_download=True' to overwrite.")
        return file_path

    partial_path = f"{file_path}.part"
    zip_response = self.session.get(extract_url, stream=True)
    try:
        if zip_response.status_code != 200:
            raise ValueError(f"Failed to download ZIP file. Status code: {zip_response.status_code}")
        print(f"Downloading: {extract_url}")
        try:
            with open(partial_path, "wb") as f:
                for chunk in zip_response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
            # Promote the finished download to its final name in one step.
            os.replace(partial_path, file_path)
        except BaseException:
            # Do not leave a half-written file behind.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            raise
    finally:
        # A streamed response keeps its connection until it is closed.
        zip_response.close()

    print("ZIP file downloaded and saved!")
    return file_path