Source code for pyef.manage

"""Functions for managing files needed for pyEF"""

def parse_job_batch_file(file_path):
    """
    Parse a CSV-style job batch file into parallel per-job lists.

    Blank lines are skipped, and everything from the first ``#`` on a line
    to the end of that line is treated as a comment and ignored.

    Parameters
    ----------
    file_path : str
        The file path of the CSV file to be parsed.

    Returns
    -------
    analysis_types : list of str
        Analysis type of each job as written in the file (original case
        preserved): 'ef', 'esp', 'estab', or combinations like 'ef+esp'.
    molden_paths : list of str
        List of paths to .molden files.
    xyz_paths : list of str
        List of paths to .xyz files.
    metal_indices : list of (int or None), or None
        Metal atom index of each job. For jobs given as bond tuples this is
        the first atom of the first tuple. ``None`` is returned instead of a
        list when at least one job was read and no job has a metal index.
    bond_indices : list of list of tuple, or None
        Bond tuples ``(atom1, atom2)`` of each job (empty list for a job
        without bonds). ``None`` is returned instead of a list when at least
        one job was read and no job has any bond.

    Raises
    ------
    ValueError
        If a line has fewer than 3 columns, an unknown analysis type, an
        empty molden/xyz path, a non-integer atom index, or parentheses that
        do not form at least one ``(int, int)`` tuple.

    Notes
    -----
    New format (required):
    - analysis_type, path_to_molden, path_to_xyz, [bond_tuples or metal_index]
    - Examples:
        - ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26), (25, 27)
        - esp, /path/to/optim.molden, /path/to/optim.xyz, 30
        - estab, /path/to/optim.molden, /path/to/optim.xyz
        - ef+esp, /path/to/optim.molden, /path/to/optim.xyz, 35

    A plain metal index may be followed by one bonded-atom index
    (``..., 35, 36``), which yields the single bond ``(35, 36)``.
    """
    import re

    # Valid analysis type keywords
    valid_analysis = {'ef', 'esp', 'estab'}
    # Compiled once for the whole file; matches bond tuples such as "(25, 26)".
    tuple_pattern = re.compile(r'\(\s*(\d+)\s*,\s*(\d+)\s*\)')

    analysis_types = []
    molden_paths = []
    xyz_paths = []
    metal_indices = []
    bond_indices = []

    with open(file_path, 'r') as file:
        for line_num, raw_line in enumerate(file, 1):
            # Strip the comment part; this also turns blank lines and
            # comment-only lines into an empty string.
            line = raw_line.split('#')[0].strip()
            if not line:
                continue

            # Split line into columns (initially by comma)
            columns = [col.strip() for col in line.split(',')]

            # Validate minimum number of columns
            if len(columns) < 3:
                raise ValueError(
                    f"Line {line_num}: Invalid format. Expected at least 3 columns: "
                    f"analysis_type, path_to_molden, path_to_xyz [, atom_indices]. "
                    f"Got {len(columns)} columns."
                )

            # Column 1: analysis type, validated case-insensitively.
            analysis_parts = set(columns[0].lower().replace('+', ' ').split())
            if not analysis_parts or not analysis_parts.issubset(valid_analysis):
                # sorted() keeps the message stable: set order is arbitrary.
                raise ValueError(
                    f"Line {line_num}: Invalid analysis type '{columns[0]}'. "
                    f"Must be one of: {', '.join(sorted(valid_analysis))} "
                    f"or combinations like 'ef+esp'."
                )

            # Column 2: molden path
            molden_path = columns[1]
            if not molden_path:
                raise ValueError(f"Line {line_num}: Missing molden file path in column 2.")

            # Column 3: xyz path
            xyz_path = columns[2]
            if not xyz_path:
                raise ValueError(f"Line {line_num}: Missing xyz file path in column 3.")

            # Remaining columns: bond tuples, or a metal index optionally
            # followed by a bonded atom index. Tuples were broken apart by the
            # comma split above, so re-join before matching them.
            index_columns = columns[3:]
            remainder = ','.join(index_columns)

            if '(' in remainder and ')' in remainder:
                job_bonds = [
                    (int(atom1), int(atom2))
                    for atom1, atom2 in tuple_pattern.findall(remainder)
                ]
                if not job_bonds:
                    # Parentheses without a valid (int, int) pair would
                    # otherwise be dropped silently.
                    raise ValueError(
                        f"Line {line_num}: Invalid bond tuple(s) '{remainder}'. "
                        f"Expected integer or tuple format like (1, 2)."
                    )
                # Metal index is the first atom of the first bond
                # (for compatibility).
                metal_index = job_bonds[0][0]
            elif index_columns and index_columns[0]:
                # Single metal index
                try:
                    metal_index = int(index_columns[0])
                except ValueError as err:
                    raise ValueError(
                        f"Line {line_num}: Invalid atom index '{index_columns[0]}'. "
                        f"Expected integer or tuple format like (1, 2)."
                    ) from err

                job_bonds = []
                # Optional bonded atom index; converted in its own try block so
                # that the error names the column that is actually wrong.
                if len(index_columns) > 1 and index_columns[1]:
                    try:
                        bonded_atom_index = int(index_columns[1])
                    except ValueError as err:
                        raise ValueError(
                            f"Line {line_num}: Invalid atom index '{index_columns[1]}'. "
                            f"Expected integer or tuple format like (1, 2)."
                        ) from err
                    job_bonds.append((metal_index, bonded_atom_index))
            else:
                # No additional data
                metal_index = None
                job_bonds = []

            # Append to all five lists together so they stay aligned.
            analysis_types.append(columns[0])  # Keep original case
            molden_paths.append(molden_path)
            xyz_paths.append(xyz_path)
            metal_indices.append(metal_index)
            bond_indices.append(job_bonds)

    # Return None for metal/bond indices if all are empty/None
    if metal_indices and all(m is None for m in metal_indices):
        metal_indices = None
    if bond_indices and all(not b for b in bond_indices):
        bond_indices = None

    return analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices