Source code for pleiades.utils.files

"""
File utilities for PLEIADES neutron imaging data processing.

This module provides utilities for file discovery, metadata extraction, and data export
operations. It includes functions for finding image files with dominant extensions,
extracting timing information from filenames, and exporting processed data to ASCII format.

The module supports:
- Automatic file discovery with extension filtering
- Filename-based metadata extraction for neutron imaging files
- ASCII data export with proper formatting
- Robust error handling for file operations

Example:
    Basic file discovery and export:

    >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder("/path/to/data")
    >>> print(f"Found {len(files)} {ext} files")
    >>>
    >>> data_dict = {"energy": [1, 2, 3], "transmission": [0.8, 0.6, 0.4]}
    >>> export_ascii(data_dict, "output.txt")
"""

import glob
import os
import re
from collections import Counter
from typing import Any, Dict, List, Tuple, Union

import pandas as pd

from pleiades.utils.logger import loguru_logger

logger = loguru_logger.bind(name="files")


def retrieve_list_of_most_dominant_extension_from_folder(
    folder: str = "", files: Union[List[str], None] = None
) -> Tuple[List[str], str]:
    """
    Find and return files with the most common extension from a folder or file list.

    Counts the extensions of the candidate files (the direct children of
    ``folder``, or the entries of ``files``), picks the most frequent one and
    returns every candidate carrying that extension.

    Args:
        folder (str, optional): Path to the folder to search. If non-empty,
            the ``files`` parameter is ignored. Defaults to "".
        files (List[str], optional): File paths to analyze. Only used when
            ``folder`` is empty. The list is never modified. Defaults to None.

    Returns:
        Tuple[List[str], str]: A tuple containing:
            - List of absolute file paths with the dominant extension, sorted
            - The dominant file extension (e.g., '.tiff', '.fits')

    Example:
        From folder:

        >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder("/path/to/data")
        >>> print(f"Found {len(files)} files with extension {ext}")
        Found 100 files with extension .tiff

        From file list:

        >>> file_list = ["/path/file1.tiff", "/path/file2.tiff", "/path/file3.fits"]
        >>> files, ext = retrieve_list_of_most_dominant_extension_from_folder(files=file_list)
        >>> ext
        '.tiff'

    Note:
        - If folder is provided, it takes precedence over the files parameter
        - Extension counting is case-sensitive
        - Entries without an extension are ignored when counting
        - On a tie, the extension seen first in sorted path order wins
        - In folder mode the search uses the glob pattern ``*``, which does not
          match names starting with '.', so hidden files are skipped

    Raises:
        FileNotFoundError: If folder doesn't exist
        ValueError: If no files are found or all files lack extensions
    """
    if folder:
        if not os.path.exists(folder):
            raise FileNotFoundError(f"Folder does not exist: {folder}")
        # Escape the folder so glob metacharacters in its name ("[", "*", "?")
        # are matched literally instead of being interpreted as a pattern.
        candidates = glob.glob(os.path.join(glob.escape(folder), "*"))
    else:
        candidates = list(files) if files else []

    if not candidates:
        raise ValueError("No files found to analyze")

    # sorted() builds a new list, so a caller-supplied ``files`` list keeps its
    # order; the sort also makes tie-breaking between extensions deterministic.
    candidates = sorted(candidates)
    extensions = [os.path.splitext(os.path.basename(path))[1] for path in candidates]

    # Only entries that actually have an extension take part in the vote.
    extension_counts = Counter(ext for ext in extensions if ext)
    if not extension_counts:
        raise ValueError("No files with extensions found")

    # most_common() lists equal counts in first-encountered order.
    dominant_extension = extension_counts.most_common(1)[0][0]

    # Select from the candidates already collected rather than globbing again:
    # a second glob on an empty ``folder`` would search the current working
    # directory and silently ignore the supplied ``files``.
    selected = sorted(
        os.path.abspath(path) for path, ext in zip(candidates, extensions) if ext == dominant_extension
    )
    return (selected, dominant_extension)
def retrieve_number_of_frames_from_file_name(file_name: str) -> int:
    """
    Extract the number of time-of-flight frames from a neutron imaging filename.

    Searches the basename for the first occurrence of ``T<digits>p`` and
    returns the digits as an integer.

    Args:
        file_name (str): Filename containing frame information in the format
            '...T{frame_count}p...'.
            Example: 'image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff'

    Returns:
        int: Number of time-of-flight frames extracted from the filename

    Example:
        >>> filename = "image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff"
        >>> retrieve_number_of_frames_from_file_name(filename)
        2000

        >>> retrieve_number_of_frames_from_file_name("data_T500p.fits")
        500

    Raises:
        ValueError: If the basename doesn't contain both 'T' and 'p'
        ValueError: If no 'T{integer}p' pattern is present in the basename

    Note:
        - Only the basename of the file is considered (path is ignored)
        - Other 'T' characters earlier in the name (e.g. 'Test_T500p.tiff')
          do not prevent the frame count from being found
    """
    # Directory components may contain their own 'T'/'p'; ignore them.
    base_name = os.path.basename(file_name)

    if "T" not in base_name or "p" not in base_name:
        raise ValueError(f"File name does not contain required 'T' and 'p' markers: {file_name}")

    # Search for the whole marker pattern instead of splitting on the first
    # 'T', which breaks as soon as another 'T' appears earlier in the name.
    match = re.search(r"T([0-9]+)p", base_name)
    if match is None:
        raise ValueError(f"Could not extract number of frames from file name: {file_name}")
    return int(match.group(1))
def retrieve_time_bin_size_from_file_name(file_name: str) -> float:
    """
    Extract the time bin size from a neutron imaging filename.

    Searches the basename for the first occurrence of ``t<number>T`` and
    returns the number as a float. An exponent written without a sign after a
    lowercase 'e' is treated as negative ('1e6' is read as '1e-6'); an
    exponent that already carries a sign is used as written.

    Args:
        file_name (str): Filename containing time bin information in the
            format '...t{bin_size}T...'.
            Example: 'image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff'

    Returns:
        float: Time bin size in seconds (typically microseconds as 1e-6)

    Example:
        >>> filename = "image_m2M9997Ex512y512t1e6T2000p1e6P100.tiff"
        >>> retrieve_time_bin_size_from_file_name(filename)
        1e-06

        >>> retrieve_time_bin_size_from_file_name("data_t0.001T500p.fits")
        0.001

    Raises:
        ValueError: If the basename doesn't contain both 't' and 'T'
        ValueError: If no 't{number}T' pattern is present in the basename

    Note:
        - Only the basename of the file is considered (path is ignored)
        - Supports both decimal and scientific notation
        - Other 't' characters earlier in the name (e.g. the one in 'data')
          do not prevent the bin size from being found
    """
    # Directory components may contain their own 't'/'T'; ignore them.
    base_name = os.path.basename(file_name)

    if "t" not in base_name or "T" not in base_name:
        raise ValueError(f"File name does not contain required 't' and 'T' markers: {file_name}")

    # Search for the whole marker pattern instead of splitting on the first
    # 't', which picks the wrong segment whenever the name has an earlier 't'.
    match = re.search(r"t((?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?)T", base_name)
    if match is None:
        raise ValueError(f"Could not extract time bin size from file name: {file_name}")

    # File names drop the minus sign of the exponent ('1e6' means 1e-6).
    # Insert it only when the exponent is unsigned, so '1e-6' stays valid
    # instead of becoming '1e--6'.
    corrected_value = re.sub(r"e(?=[0-9])", "e-", match.group(1))
    return float(corrected_value)
def export_ascii(data_dict: Dict[str, Union[List, Any]], file_path: str) -> None:
    """
    Export processed data to a tab-separated ASCII file.

    Builds a pandas DataFrame from ``data_dict`` and writes it with
    ``DataFrame.to_csv`` using tab separation, a header row and no index
    column. A confirmation message is printed and logged on success.

    Args:
        data_dict (Dict[str, Union[List, Any]]): Dictionary containing data to
            export. Keys become column headers, values become data columns.
        file_path (str): Path to the output ASCII file. A missing parent
            directory is created; a bare file name is written to the current
            working directory.

    Example:
        >>> data = {
        ...     "energy_eV": [1.0, 2.0, 3.0],
        ...     "transmission": [0.8, 0.6, 0.4],
        ...     "uncertainties": [0.1, 0.08, 0.06]
        ... }
        >>> export_ascii(data, "transmission_results.txt")
        Data exported to transmission_results.txt

    Raises:
        ValueError: If data_dict is empty
        OSError: If the parent directory cannot be created
        IOError: If building the table or writing the file fails for any
            reason; the original exception is attached as ``__cause__``

    Note:
        - Includes column headers in the first row
        - Overwrites existing files without warning
    """
    if not data_dict:
        raise ValueError("Data dictionary cannot be empty")

    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when file_path actually has a directory component.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        df = pd.DataFrame(data_dict)
        df.to_csv(file_path, sep="\t", index=False, header=True)
        print(f"Data exported to {file_path}")
        logger.info(f"Data exported to {file_path}")
    except Exception as e:
        # Any failure in this block is logged and reported to the caller as IOError.
        error_msg = f"Failed to export data to {file_path}: {str(e)}"
        logger.error(error_msg)
        raise IOError(error_msg) from e