Source code for pleiades.sammy.io.data_manager

"""
SAMMY data format management utilities.

This module provides functions for converting between different data formats
required by SAMMY, including the twenty-column fixed-width format used for
experimental transmission data.
"""

from pathlib import Path
from typing import Union

import numpy as np

from pleiades.utils.logger import loguru_logger

logger = loguru_logger.bind(name="sammy_data_manager")


def _split_fields(line: str) -> list:
    """Split one stripped input line into raw string fields.

    A line containing a comma is treated as comma-separated and every field is
    stripped, so ``"6.673, 0.932"`` parses the same as ``"6.673,0.932"``.
    Any other line is split on runs of whitespace, which covers both space-
    and tab-separated files.
    """
    if "," in line:
        return [field.strip() for field in line.split(",")]
    return line.split()


def convert_csv_to_sammy_twenty(csv_file: Union[str, Path], twenty_file: Union[str, Path]) -> None:
    """
    Convert transmission spectra from CSV to SAMMY twenty format.

    This function supports tab-, comma-, and space-separated files, with
    either two columns (energy, transmission) or three columns
    (energy, transmission, uncertainty). If only two columns are present,
    the uncertainty column is filled with 0.0.

    Blank lines, lines starting with ``#`` and lines whose fields cannot all
    be parsed as floats (typically column headers) are skipped. A line whose
    first field is numeric but which does not have 2 or 3 fields is skipped
    with a warning.

    Args:
        csv_file: Path to input CSV file with columns:
            energy_eV, transmission, [uncertainty]
        twenty_file: Path to output SAMMY twenty format file. Missing parent
            directories are created.

    Raises:
        ValueError: If no numeric data rows are found, or if the data rows do
            not all have the same number of columns.

    File Formats:
        Input (comma, tab, or space separated), for example::

            energy_eV,transmission,uncertainty
            6.673,0.932,0.272

        or::

            # Energy(eV) Transmission Uncertainty
            6.673240e+00 1.003460e+00 7.242967e-03

        or, without uncertainties::

            energy_eV,transmission
            6.673,0.932

        Output twenty (three right-aligned 20-character columns)::

                    6.6732397079        0.9323834777        0.2727669477

    Example:
        >>> convert_csv_to_sammy_twenty(
        ...     "transmission.txt",
        ...     "transmission.twenty"
        ... )
    """
    logger.info(f"Converting {csv_file} to SAMMY twenty format: {twenty_file}")

    rows = []
    with open(csv_file, "r") as f:
        for raw_line in f:
            line = raw_line.strip()

            # Skip blank lines and comment lines
            if not line or line.startswith("#"):
                continue

            fields = _split_fields(line)

            if len(fields) not in (2, 3):
                # A non-numeric first field means this is a header: skip quietly.
                try:
                    float(fields[0])
                except (ValueError, IndexError):
                    continue
                # Numeric but malformed row: tell the user it was dropped.
                logger.warning(f"Skipping line with {len(fields)} fields: {line[:50]}...")
                continue

            try:
                rows.append([float(field) for field in fields])
            except ValueError:
                # Right number of fields but not numeric: a header line, skip it
                continue

    if not rows:
        raise ValueError(f"No valid data found in {csv_file}")

    # Mixed 2- and 3-column rows cannot form a rectangular array; fail with a
    # clear message instead of NumPy's ragged-array error.
    column_counts = {len(row) for row in rows}
    if len(column_counts) != 1:
        raise ValueError(
            f"Inconsistent column counts in {csv_file}: found rows with "
            f"{sorted(column_counts)} columns, expected all rows to have 2 or all to have 3"
        )

    data = np.array(rows, dtype=float)

    # Two columns (energy, transmission): append a zero uncertainty column
    if data.shape[1] == 2:
        data = np.column_stack([data, np.zeros(data.shape[0])])

    # Check if output directory exists, create if not
    Path(twenty_file).parent.mkdir(parents=True, exist_ok=True)

    # Write to SAMMY twenty format (fixed-width columns).
    # NOTE: 20 is a *minimum* field width; a value whose fixed-point form is
    # longer than 20 characters will widen its column.
    with open(twenty_file, "w") as f:
        for energy, transmission, uncertainty in data:
            f.write(f"{energy:20.10f}{transmission:20.10f}{uncertainty:20.10f}\n")

    logger.info(f"Converted {len(data)} data points to twenty format")
def validate_sammy_twenty_format(twenty_file: Union[str, Path]) -> bool:
    """
    Validate that a file follows SAMMY twenty format requirements.

    Checks that the file contains at least one line, that each line has
    exactly 60 characters (3 columns x 20 chars each, line terminator
    excluded) and that each 20-character column parses as a float.

    Every failure is logged through the module logger; the function itself
    reports the outcome only through its return value.

    Args:
        twenty_file: Path to file to validate

    Returns:
        bool: True if file is valid twenty format. False if the file is
        empty, any line has the wrong length, any column is not a float, or
        the file cannot be opened or decoded.

    Example:
        >>> is_valid = validate_sammy_twenty_format("data.twenty")
        >>> print(f"File is valid: {is_valid}")
    """
    try:
        line_count = 0
        with open(twenty_file, "r") as f:
            for line_num, line in enumerate(f, 1):
                line_count = line_num

                # Remove newline for length check
                line_content = line.rstrip("\n\r")

                # Check line length (60 chars = 3 x 20-char columns)
                if len(line_content) != 60:
                    logger.error(f"Line {line_num}: Expected 60 characters, got {len(line_content)}")
                    return False

                # Each 20-character column must parse as a float; the values
                # themselves are not needed.
                try:
                    for start in (0, 20, 40):
                        float(line_content[start : start + 20])
                except ValueError as e:
                    logger.error(f"Line {line_num}: Could not parse as floats: {e}")
                    return False

        # A file with no lines carries no data and must not pass validation.
        if line_count == 0:
            logger.error(f"File {twenty_file} contains no data lines")
            return False

        logger.info(f"File {twenty_file} is valid SAMMY twenty format")
        return True

    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        logger.error(f"Error validating {twenty_file}: {e}")
        return False