Source code for pyef.manage

"""Functions for managing files needed for pyEF"""

def parse_job_batch_file(file_path):
    """
    Parse a job batch CSV file into parallel, per-job lists.

    Each non-empty line describes one job. Python style comments are allowed
    on any line: everything from the first ``#`` to the end of the line is
    discarded before parsing, and lines that are empty afterwards are skipped.

    Parameters
    ----------
    file_path : str
        The file path of the CSV file to be parsed.

    Returns
    -------
    analysis_types : list of str
        Analysis type of each job as written in the file ('ef', 'esp',
        'estab', or combinations like 'ef+esp'). Validation is
        case-insensitive, but the original spelling is returned.
    molden_paths : list of str
        List of paths to .molden files.
    xyz_paths : list of str
        List of paths to .xyz files.
    metal_indices : list of (int or None), or None
        Metal atom index of each job. For jobs given as bond tuples this is
        the first atom of the first tuple; for jobs without indices the entry
        is None. The whole return value is None if the file has at least one
        job and no job provides an index.
    bond_indices : list of list of tuple, or None
        Bond tuples ``(atom1, atom2)`` of each job; an empty list for jobs
        without bonds. The whole return value is None if the file has at
        least one job and no job provides a bond.

    Raises
    ------
    ValueError
        If a line has fewer than 3 columns, an unknown analysis type, an
        empty molden or xyz path, a non-integer atom index, or a
        parenthesised group that is not a pair of non-negative integers.

    Notes
    -----
    New format (required):
    - analysis_type, path_to_molden, path_to_xyz, [bond_tuples or metal_index]
    - Examples:
      - ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26), (25, 27)
      - esp, /path/to/optim.molden, /path/to/optim.xyz, 30
      - estab, /path/to/optim.molden, /path/to/optim.xyz
      - ef+esp, /path/to/optim.molden, /path/to/optim.xyz, 35

    A plain metal index may be followed by one bonded atom index
    (``..., 35, 36``), which yields the single bond ``(35, 36)``.
    When bond tuples are present, tokens outside the parentheses are ignored.
    """
    import re

    # Valid analysis type keywords
    valid_analysis = {'ef', 'esp', 'estab'}

    # One bond tuple such as "(25, 26)"; compiled once instead of per line.
    tuple_pattern = re.compile(r'\(\s*(\d+)\s*,\s*(\d+)\s*\)')

    analysis_types = []
    molden_paths = []
    xyz_paths = []
    metal_indices = []
    bond_indices = []

    with open(file_path, 'r') as file:
        for line_num, raw_line in enumerate(file, 1):
            # Strip the comment part; this also covers blank lines and
            # lines that consist of a comment only.
            line = raw_line.split('#')[0].strip()
            if not line:
                continue

            # Split line into columns (by comma). Bond tuples contain commas
            # too, so columns after the third are re-joined further below.
            columns = [col.strip() for col in line.split(',')]

            # Validate minimum number of columns
            if len(columns) < 3:
                raise ValueError(
                    f"Line {line_num}: Invalid format. Expected at least 3 columns: "
                    f"analysis_type, path_to_molden, path_to_xyz [, atom_indices]. "
                    f"Got {len(columns)} columns."
                )

            # Column 1: analysis type, e.g. "ef" or "ef+esp"
            analysis_parts = set(columns[0].lower().replace('+', ' ').split())
            if not analysis_parts or not analysis_parts.issubset(valid_analysis):
                # sorted() keeps the message identical from run to run;
                # plain set iteration order is not stable for strings.
                raise ValueError(
                    f"Line {line_num}: Invalid analysis type '{columns[0]}'. "
                    f"Must be one of: {', '.join(sorted(valid_analysis))} "
                    f"or combinations like 'ef+esp'."
                )

            # Column 2: molden path
            molden_path = columns[1]
            if not molden_path:
                raise ValueError(f"Line {line_num}: Missing molden file path in column 2.")

            # Column 3: xyz path
            xyz_path = columns[2]
            if not xyz_path:
                raise ValueError(f"Line {line_num}: Missing xyz file path in column 3.")

            # Remaining columns: bond tuples and/or plain atom indices
            extra = columns[3:]
            remainder = ','.join(extra)

            if '(' in remainder and ')' in remainder:
                # Tuple format for bonds
                matches = tuple_pattern.findall(remainder)

                # Every parenthesised group must be a valid pair; otherwise a
                # typo such as "(3; 4)" would silently drop that bond.
                if (remainder.count('(') != len(matches)
                        or remainder.count(')') != len(matches)):
                    raise ValueError(
                        f"Line {line_num}: Invalid bond tuple in '{remainder}'. "
                        f"Expected integer pairs like (1, 2)."
                    )

                job_bonds = [(int(first), int(second)) for first, second in matches]
                # Metal index is the first atom of the first bond (for compatibility)
                job_metal = job_bonds[0][0]

            elif extra and extra[0]:
                # Plain metal index, optionally followed by one bonded atom index
                tokens = [extra[0]]
                if len(extra) > 1 and extra[1]:
                    tokens.append(extra[1])

                atoms = []
                for token in tokens:
                    try:
                        atoms.append(int(token))
                    except ValueError as err:
                        # Report the token that actually failed to parse.
                        raise ValueError(
                            f"Line {line_num}: Invalid atom index '{token}'. "
                            f"Expected integer or tuple format like (1, 2)."
                        ) from err

                job_metal = atoms[0]
                job_bonds = [(atoms[0], atoms[1])] if len(atoms) == 2 else []

            else:
                # No additional data
                job_metal = None
                job_bonds = []

            # Append only after the whole line validated, so the five lists
            # always stay the same length.
            analysis_types.append(columns[0])  # Keep original case
            molden_paths.append(molden_path)
            xyz_paths.append(xyz_path)
            metal_indices.append(job_metal)
            bond_indices.append(job_bonds)

    # Return None for metal/bond indices if all are empty/None
    if metal_indices and all(m is None for m in metal_indices):
        metal_indices = None
    if bond_indices and all(not b for b in bond_indices):
        bond_indices = None

    return analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices