Source code for pleiades.utils.files

"""
File utilities for PLEIADES neutron imaging data processing.

This module provides utilities for file discovery, metadata extraction, and data export
operations. It includes functions for finding image files with dominant extensions,
extracting timing information from filenames, and exporting processed data to ASCII format.

The module supports:
- Automatic file discovery with extension filtering
- Filename-based metadata extraction for neutron imaging files
- ASCII data export with proper formatting
- Robust error handling for file operations

Example:
    Basic file discovery and export:

    >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder("/path/to/data")
    >>> print(f"Found {len(files)} {ext} files")
    >>>
    >>> data_dict = {"energy": [1, 2, 3], "transmission": [0.8, 0.6, 0.4]}
    >>> export_ascii(data_dict, "output.txt")
"""

import glob
import os
import re
from collections import Counter
from typing import Any, Dict, List, Tuple, Union

import pandas as pd

from pleiades.utils.logger import loguru_logger

logger = loguru_logger.bind(name="files")



[docs]
def retrieve_list_of_most_dominant_extension_from_folder(
    folder: str = "", files: Union[List[str], None] = None
) -> Tuple[List[str], str]:
    """
    Find and return files with the most common extension from a folder or file list.

    Analyzes a folder or list of files to determine the most frequently occurring
    file extension, then returns all candidates carrying that extension.

    Args:
        folder (str, optional): Path to folder to search. If non-empty, the
                              files parameter is ignored. Defaults to "".
        files (List[str], optional): List of file paths to analyze. Only used
                                   if folder is empty. The list is never modified.
                                   Defaults to None.

    Returns:
        Tuple[List[str], str]: A tuple containing:
            - List of absolute paths with the dominant extension, in the sorted
              order of the input paths
            - The dominant file extension (e.g., '.tiff', '.fits')

    Example:
        From folder:
        >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder("/path/to/data")
        >>> print(f"Found {len(files)} files with extension {ext}")
        Found 100 files with extension .tiff

        From file list:
        >>> file_list = ["/path/file1.tiff", "/path/file2.tiff", "/path/file3.fits"]
        >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder(files=file_list)
        >>> ext
        '.tiff'

    Note:
        - If folder is provided, it takes precedence over files parameter
        - Extension counting is case-sensitive
        - In folder mode the search uses the glob pattern '*', which does not
          match names starting with '.', so hidden entries are skipped
        - No file/directory distinction is made: any entry whose name carries
          the dominant extension is returned
        - On a tie, the extension met first in the sorted path list wins

    Raises:
        FileNotFoundError: If folder doesn't exist
        ValueError: If no files are found or all files lack extensions
    """
    if folder:
        if not os.path.exists(folder):
            raise FileNotFoundError(f"Folder does not exist: {folder}")
        candidates = glob.glob(os.path.join(folder, "*"))
    else:
        candidates = files if files else []

    if not candidates:
        raise ValueError("No files found to analyze")

    # sorted() builds a new list, so a caller-supplied `files` is left untouched.
    candidates = sorted(candidates)
    extensions = [os.path.splitext(os.path.basename(path))[1] for path in candidates]

    # Entries without an extension do not take part in the vote.
    extension_counts = Counter(ext for ext in extensions if ext)
    if not extension_counts:
        raise ValueError("No files with extensions found")

    # max() keeps the first maximal key in insertion order, which gives the
    # tie-break described in the docstring.
    dominant_extension = max(extension_counts, key=extension_counts.get)

    # Filter the candidates already collected rather than globbing again:
    # re-globbing with an empty folder would search the current working
    # directory instead of the supplied file list.
    selected_files = [
        os.path.abspath(path)
        for path, ext in zip(candidates, extensions)
        if ext == dominant_extension
    ]

    return (selected_files, dominant_extension)




[docs]
def retrieve_number_of_frames_from_file_name(file_name: str) -> int:
    """
    Extract the number of time-of-flight frames from a neutron imaging filename.

    Searches the basename for the first occurrence of the pattern
    'T{digits}p' and returns the digits as an integer. Other capital 'T'
    characters earlier in the name (e.g. a 'TOF_' prefix) are skipped because
    they are not followed by digits and a 'p'.

    Args:
        file_name (str): Filename (or path) containing frame information in the
                        format '...T{frame_count}p...'. Example:
                        'image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff'

    Returns:
        int: Frame count extracted from the filename

    Example:
        >>> filename = "image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff"
        >>> retrieve_number_of_frames_from_file_name(filename)
        2000

        >>> retrieve_number_of_frames_from_file_name("data_T500p.fits")
        500

        >>> retrieve_number_of_frames_from_file_name("TOF_scan_T750p.tiff")
        750

    Raises:
        ValueError: If the filename doesn't contain both 'T' and 'p' markers
        ValueError: If both markers are present but no 'T{digits}p' group is found

    Note:
        - Only the basename of the file is considered (path is ignored)
        - The frame count must be made of decimal digits only
    """
    # Directory components may themselves contain 'T'/'p'; ignore them.
    base_name = os.path.basename(file_name)

    # Anchor on the whole 'T<digits>p' group instead of splitting on the first
    # 'T', so unrelated capital T's in the name cannot derail the parse.
    match = re.search(r"T(\d+)p", base_name)
    if match:
        return int(match.group(1))

    if "T" in base_name and "p" in base_name:
        raise ValueError(f"Could not extract number of frames from file name: {file_name}")
    raise ValueError(f"File name does not contain required 'T' and 'p' markers: {file_name}")




[docs]
def retrieve_time_bin_size_from_file_name(file_name: str) -> float:
    """
    Extract the time bin size from a neutron imaging filename.

    Searches the basename for the first occurrence of the pattern
    't{number}T', where the number is a decimal value with an optional
    lowercase 'e' exponent. Other lowercase 't' characters earlier in the
    name (e.g. in 'data_') are skipped because they are not followed by a
    number and a 'T'.

    Args:
        file_name (str): Filename (or path) containing time bin information in
                        the format '...t{bin_size}T...'. Example:
                        'image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff'

    Returns:
        float: The parsed time bin size. The code does not encode a unit;
               NOTE(review): the naming convention presumably means seconds
               (1e6 -> 1e-6 s) -- confirm against the acquisition software.

    Example:
        >>> filename = "image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff"
        >>> retrieve_time_bin_size_from_file_name(filename)
        1e-06

        >>> retrieve_time_bin_size_from_file_name("data_t0.001T500p.fits")
        0.001

    Raises:
        ValueError: If the filename doesn't contain both 't' and 'T' markers
        ValueError: If both markers are present but no 't{number}T' group is found

    Note:
        - An exponent written without a sign is treated as negative
          ('1e6' is read as 1e-6), matching the file-naming convention
        - An exponent that already carries a sign ('1e-6', '1e+3') is used as written
        - Only the basename of the file is considered (path is ignored)
    """
    # Directory components may themselves contain 't'/'T'; ignore them.
    base_name = os.path.basename(file_name)

    # Groups: mantissa, optional exponent sign, optional exponent digits.
    # Anchoring on the whole 't<number>T' group (rather than splitting on the
    # first 't') keeps unrelated lowercase t's from derailing the parse.
    match = re.search(r"t(\d+\.?\d*|\.\d+)(?:e([+-]?)(\d+))?T", base_name)
    if match:
        mantissa, sign, exponent = match.groups()
        if exponent is None:
            return float(mantissa)
        # Only an unsigned exponent gets the implicit minus sign; blindly
        # inserting '-' would turn an explicit '1e-6' into the invalid '1e--6'.
        return float(f"{mantissa}e{sign or '-'}{exponent}")

    if "t" in base_name and "T" in base_name:
        raise ValueError(f"Could not extract time bin size from file name: {file_name}")
    raise ValueError(f"File name does not contain required 't' and 'T' markers: {file_name}")




[docs]
def export_ascii(data_dict: Dict[str, Union[List, Any]], file_path: str) -> None:
    """
    Export processed data to a tab-separated ASCII file.

    Builds a pandas DataFrame from the dictionary and writes it with
    ``DataFrame.to_csv`` using a tab separator, a header row and no index
    column. A confirmation message is printed and logged on success.

    Args:
        data_dict (Dict[str, Union[List, Any]]): Dictionary containing data to export.
                                               Keys become column headers, values become data columns.
                                               All values should be array-like with the same length.
        file_path (str): Path to the output ASCII file. Parent directories will be
                        created if they don't exist. A bare filename writes to the
                        current working directory.

    Example:
        Basic export:
        >>> data = {
        ...     "energy_eV": [1.0, 2.0, 3.0],
        ...     "transmission": [0.8, 0.6, 0.4],
        ...     "uncertainties": [0.1, 0.08, 0.06]
        ... }
        >>> export_ascii(data, "transmission_results.txt")
        Data exported to transmission_results.txt

        Output file format:
        energy_eV	transmission	uncertainties
        1.0	0.8	0.1
        2.0	0.6	0.08
        3.0	0.4	0.06

    Raises:
        ValueError: If data_dict is empty
        IOError: If the table cannot be built or written. Every exception raised
                 while constructing the DataFrame or writing the file (including
                 mismatched column lengths) is re-raised as IOError with the
                 original exception chained as its cause.
        OSError: If the parent directory cannot be created

    Note:
        - Uses tab separation and includes column headers in the first row
        - Overwrites existing files without warning
    """
    # Validate input
    if not data_dict:
        raise ValueError("Data dictionary cannot be empty")

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a parent when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        df = pd.DataFrame(data_dict)
        df.to_csv(file_path, sep="\t", index=False, header=True)
        print(f"Data exported to {file_path}")
        logger.info(f"Data exported to {file_path}")
    except Exception as e:
        # Funnel every failure into IOError so callers have one type to catch.
        error_msg = f"Failed to export data to {file_path}: {str(e)}"
        logger.error(error_msg)
        raise IOError(error_msg) from e