# Source code for cachai.data.remote_data

import os
import time
import pandas as pd
import hashlib
import warnings
import json
from   urllib.request import urlopen, Request
from   urllib.error import URLError
from  ._utils import sizeof_fmt, CACHE_DIR, DATASETS_REPO, DATASETS_CATALOG

def _get_cache_path(url):
    """Return the cache file path for ``url``.

    The file name is the hexadecimal MD5 digest of the encoded URL, placed
    directly inside ``CACHE_DIR``. The digest is used only to derive a
    file name from the URL.
    """
    digest = hashlib.md5(url.encode())
    return os.path.join(CACHE_DIR, digest.hexdigest())

def _download_with_cache(url, force=False):
    """Download ``url`` into the persistent cache and return the cached path.

    Parameters
        url : :class:`str`
            Address of the file to download.
        force : :class:`bool`, optional
            If ``True``, download again even when a cached copy exists
            (default: ``False``).

    Returns
        :class:`str`
            Path of the cached file inside ``CACHE_DIR``.

    Raises
        :class:`ConnectionError`
            If the download fails with a ``URLError`` and no cached copy exists.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_path = _get_cache_path(url)

    if not force and os.path.exists(cache_path):
        return cache_path

    try:
        req = Request(url, headers={'User-Agent': 'CACHAI'})
        with urlopen(req) as response:
            data = response.read()
    except URLError as e:
        # Fallback to existing cache
        if os.path.exists(cache_path):
            warnings.warn(f"Using cached version due to an error. Details: {str(e)}")
            return cache_path
        raise ConnectionError(f"Error downloading {url}. Details: {str(e)}") from e

    # Write to a sibling temporary file and swap it into place, so a failed or
    # interrupted write never leaves a truncated file at ``cache_path`` (which
    # later calls would otherwise return as a valid cache hit).
    tmp_path = f"{cache_path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            f.write(data)
        os.replace(tmp_path, cache_path)
    finally:
        # Only present here if the write or the swap failed.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return cache_path

def _get_datasets_catalog(force=False):
    """Fetch the dataset catalog through the cache and return it parsed from JSON.

    ``force`` is forwarded to ``_download_with_cache``; when true the catalog
    file is downloaded again instead of being read from the cache.
    """
    catalog_path = _download_with_cache(DATASETS_REPO + DATASETS_CATALOG, force)

    with open(catalog_path, 'r', encoding='utf-8') as handle:
        catalog = json.load(handle)
    return catalog


# [docs]
def get_dataset_repo():
    """
    Give the address of the **cachai** datasets repository.

    The address is a fixed string; no network access is performed.

    Returns
        :class:`str`
            GitHub URL of the dataset repository.

    Examples
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    .. code-block:: python
        :class: in-block

        import cachai.data as chd

        print(chd.get_dataset_repo())

    .. code-block:: text
        :class: out-block

        https://github.com/DD-Beltran-F/cachai-datasets
    """
    repo_url = 'https://github.com/DD-Beltran-F/cachai-datasets'
    return repo_url



# [docs]
def get_dataset_names():
    """
    List the names of the datasets present in the catalog.

    The catalog is requested with a forced download on every call, so the
    names reflect the current remote catalog whenever it can be fetched.

    Returns
        :class:`list` of :class:`str`
            Names of the datasets available in the catalog.

    Examples
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    .. code-block:: python
        :class: in-block

        import cachai.data as chd

        print(chd.get_dataset_names())

    .. code-block:: text
        :class: out-block

        ["lithium", "correlations", "correlations_big"]
    """
    return [dataset_name for dataset_name in _get_datasets_catalog(True).keys()]



# [docs]
def get_dataset_metadata(name):
    """
    Print the metadata of a specific dataset.

    The catalog is requested with a forced download on every call. The alias,
    file name, description and column names of the dataset are printed to
    standard output; nothing is returned.

    Parameters
        name : :class:`str`
            Name of the dataset.

    Raises
        :class:`ValueError`
            If ``name`` is not a key of the dataset catalog.

    Examples
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    The output below is illustrative: field values come from the remote
    catalog, and the rule width and line wrapping of the printed text may
    differ from what is shown.

    .. code-block:: python
        :class: in-block

        import cachai.data as chd

        chd.get_dataset_metadata('lithium')

    .. code-block:: text
        :class: out-block

        ══════════════════════════════════════════════════════════════════════════════════════════
        METADATA OF DATASET: LITHIUM
        ──────────────────────────────────────────────────────────────────────────────────────────
        Alias       : lithium
        Filename    : lithium.csv
        Description : Data of lithium abundances and stellar parameters from  M. L. L. Dantas et
                      al. (2025) doi: 10.1051/0004-6361/202453034
        Columns     : CNAME, [Fe/H], A(Li), \\overline{t}_{\\star}, M, T_{eff}, e, Z_{max}, L_z
        ══════════════════════════════════════════════════════════════════════════════════════════
    """
    catalog = _get_datasets_catalog(True)

    if name not in catalog:
        # The catalog was just refreshed above, so list its keys directly
        # instead of triggering a second forced download for the message.
        raise ValueError(f"Dataset '{name}' does not exist. "
                         f"The current valid datasets are: {', '.join(catalog)}.")

    dataset_meta = catalog[name]

    print('═'*50)
    print(f'METADATA OF DATASET: {name.upper()}')
    print('─'*50)
    print(f'Alias       : {name}')
    print(f"Filename    : {dataset_meta.get('filename', 'Not specified')}")
    print(f"Description : {dataset_meta.get('description', 'Not available')}")
    columns = dataset_meta.get('columns', 'Not specified')
    if isinstance(columns, list):
        # Column labels may carry '$' math delimiters; drop them for plain-text output.
        columns = ', '.join(columns).replace('$', '')
    print(f"Columns     : {columns}")
    print("═"*50 + "\n")



# [docs]
def load_dataset(name="",redownload=False):
    """
    Load a dataset from GitHub with a persistent cache system.

    The dataset catalog is read first (from the cache unless ``redownload``
    is set). If ``name`` is missing from a cached catalog, the catalog is
    refreshed once before giving up, so datasets published after the catalog
    was cached can still be loaded.

    Parameters
        name : :class:`str`
            Name of the dataset to load.
        redownload : :class:`bool`, optional
            Whether to force re-downloading the dataset, ignoring the cache
            (default: ``False``).

    Returns
        :class:`pandas.DataFrame`
            DataFrame containing the dataset.

    Raises
        :class:`ValueError`
            If ``name`` is not a key of the dataset catalog.
        :class:`ConnectionError`
            If a required download fails and no cached copy is available.

    Examples
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    .. code-block:: python
        :class: in-block

        import cachai.data as chd

        df = chd.load_dataset('lithium')
        print(df.head())

    .. code-block:: text
        :class: out-block

                      CNAME  [Fe/H]  A(Li)  ...         e  Z$_{max}$        $L_z$
        0  00000302-6002570   -0.31   2.01  ...  0.119362   1.003228  2028.535804
        1  00001749-5449565   -0.17   1.98  ...  0.110635   0.670432  1907.144965
        2  00012216-5458205   -0.07   1.51  ...  0.276396   0.996552  1836.529851
        3  00040666-3709129    0.28   0.62  ...  0.112774   0.501939  1676.768299
        4  00042981-4701022   -0.34   1.71  ...  0.257109   1.327526  2239.902280

        [5 rows x 9 columns]
        ...
    """
    catalog = _get_datasets_catalog(redownload)

    if name not in catalog and not redownload:
        # The cached catalog may predate the requested dataset; refresh it
        # once so the lookup and the error message agree on what exists.
        catalog = _get_datasets_catalog(True)

    if name not in catalog:
        raise ValueError(f"Dataset '{name}' does not exist. "
                         f"The current valid datasets are: {', '.join(catalog)}.")

    url = DATASETS_REPO + catalog[name]['filename']
    cached_file = _download_with_cache(url, redownload)

    return pd.read_csv(cached_file)



# [docs]
def clear_cache(max_age_days=0):
    """
    Delete old cached files from **cachai**'s cache directory.

    Only regular files directly inside the cache directory are considered.
    A summary of the number of files deleted and the space freed is printed;
    nothing is returned. Files that cannot be deleted produce a warning and
    are skipped.

    Parameters
        max_age_days : :class:`int`, optional
            Maximum file age in days. Files older than this will be deleted.
            If set to ``0`` (default), all files are removed.

    Examples
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    .. code-block:: python
        :class: in-block

        import cachai.data as chd

        # Delete all cached files
        chd.clear_cache()

    .. code-block:: text
        :class: out-block

        3 file(s) deleted by 2025-09-29 (14:43:47) from cachai's cache folder.
        Space freed: 153.1 KB (100.0%).

    You can also choose the maximum file age in days, files older than ``max_age_days`` will be
    deleted:

    .. code-block:: python
        :class: mock-block

        import cachai.data as chd

        # Delete files older than 30 days
        chd.clear_cache(max_age_days=30)
    """
    now = time.time()

    # A missing cache directory just means nothing has been cached yet;
    # listing it would raise FileNotFoundError.
    if not os.path.isdir(CACHE_DIR):
        print("cachai's cache folder is already empty.")
        return

    cached_files = [
        path
        for path in (os.path.join(CACHE_DIR, entry) for entry in os.listdir(CACHE_DIR))
        if os.path.isfile(path)
    ]

    # Decide emptiness by file count, not byte count, so zero-byte files
    # are still cleaned up.
    if not cached_files:
        print("cachai's cache folder is already empty.")
        return

    total_space = sum(os.path.getsize(path) for path in cached_files)
    cutoff      = now - max_age_days * 86400
    counts      = 0
    freed_space = 0

    for filepath in cached_files:
        try:
            info = os.stat(filepath)
            # max_age_days <= 0 means "remove everything", regardless of
            # the file's timestamp.
            if max_age_days > 0 and info.st_mtime >= cutoff:
                continue
            os.remove(filepath)
        except OSError as e:
            warnings.warn(f'Could not delete {filepath}. Details: {str(e)}')
            continue
        counts += 1
        freed_space += info.st_size

    # Guard the percentage against a cache made only of zero-byte files.
    percent = freed_space / total_space * 100 if total_space else 0.0

    now_str = time.strftime("%Y-%m-%d (%H:%M:%S)", time.localtime(now))
    print(f"{counts} file(s) deleted by {now_str} from cachai's cache folder.\n"
          f'Space freed: {sizeof_fmt(freed_space)} ({percent:.1f}%).')