Source code for pyef.cli

"""Command-line interface (CLI) entry point"""

import os
import yaml
import click
from .validation import (
    check_path_exists,
    validate_charge_type,
    validate_numeric_range,
    check_index_overlap,
    VALID_CHARGE_TYPES
)

[docs]def welcome():
    """Print first to welcome the user while it waits to load the modules"""
    print("\n")
    print("             ╔═══════════════════════╗")
    print("             ║        ┌──────┐       ║")
    print("             ║        │  ┌───┘       ║")
    print("             ║        │  └───┐       ║")
    print("             ║        │  ┌───┘       ║")
    print("             ║        │  ├┬┬┬┐       ║")
    print("             ║        └──┴┴┴┴┘       ║")
    print("             ║                       ║")
    print("             ║    WELCOME TO PYEF    ║")
    print("             ╚═══════════╗╔══════════╝")
    print("                 ╔═══════╝╚═══════╗                 ")
    print("                 ║ THE KULIK LAB  ║                 ")
    print("                 ╚═══════╗╔═══════╝                 ")
    print("  ╔══════════════════════╝╚══════════════════════╗  ")
    print("  ║  Code: github.com/davidkastner/pyef          ║  ")
    print("  ║  Docs: pyef.readthedocs.io                   ║  ")
    print("  ║     - Usage: pyef -c pyef.in                 ║  ")
    print("  ║  Example: github.com/davidkastner/pyef/demo  ║  ")
    print("  ╚══════════════════════════════════════════════╝  \n")

# Welcome even if no flags
welcome()

# Read in the configuration yaml file
[docs]def read_config(config_file):
    with open(config_file, 'r') as file:
        return yaml.safe_load(file)

[docs]def parse_job_batch_file(file_path):
    """
    Parse a CSV file and extract specific columns as lists and tuples.
    The input file allows Python style comments on any line.

    Parameters
    ----------
    file_path : str
        The file path of the CSV file to be parsed.

    Returns
    -------
    analysis_types : list of str
        List of analysis types ('ef', 'esp', 'estab', or combinations like 'ef+esp').
    molden_paths : list of str
        List of paths to .molden files.
    xyz_paths : list of str
        List of paths to .xyz files.
    metal_indices : list of int or None
        List containing metal atom indices.
        None if metal indices are not provided.
    bond_indices : list of tuples or None
        List of bond tuples for each job.
        None if bond indices are not provided.

    Notes
    -----
    New format (required):
    - analysis_type, path_to_molden, path_to_xyz, [bond_tuples or metal_index]
    - Examples:
      - ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26), (25, 27)
      - esp, /path/to/optim.molden, /path/to/optim.xyz, 30
      - estab, /path/to/optim.molden, /path/to/optim.xyz
      - ef+esp, /path/to/optim.molden, /path/to/optim.xyz, 35
    """
    import re

    analysis_types = []
    molden_paths = []
    xyz_paths = []
    metal_indices = []
    bond_indices = []

    # Valid analysis type keywords
    valid_analysis = {'ef', 'esp', 'estab'}

    with open(file_path, 'r') as file:
        for line_num, line in enumerate(file, 1):
            # Skip empty lines and comments that start with '#'
            if line.strip() == '' or line.strip().startswith('#'):
                continue

            # Remove comments from the line
            line = line.split('#')[0].strip()

            # Skip the line if it's empty after removing the comment
            if line == '':
                continue

            # Split line into columns (initially by comma)
            columns = [col.strip() for col in line.split(',')]

            # Validate minimum number of columns
            if len(columns) < 3:
                raise ValueError(f"""
{'='*60}
ERROR: Invalid CSV format on line {line_num}
{'='*60}
Expected at least 3 columns, got {len(columns)}
Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices]

Current line: {line}

Examples of correct format:
  ef, /path/to/job.molden, /path/to/job.xyz, (25, 26)
  esp, /path/to/job.molden, /path/to/job.xyz, 30
  estab, /path/to/job.molden, /path/to/job.xyz
  ef+esp, /path/to/job.molden, /path/to/job.xyz, (10, 11)

For more examples, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml (lines 67-74)
- README.md: Section 2.2 (Job Batch File Format)
{'='*60}
""")

            # Extract analysis type (column 1)
            analysis_type = columns[0].lower().strip()
            analysis_parts = set(analysis_type.replace('+', ' ').split())
            if not analysis_parts.issubset(valid_analysis) or not analysis_parts:
                raise ValueError(f"""
{'='*60}
ERROR: Invalid analysis type on line {line_num}
{'='*60}
Specified: '{columns[0]}'
Valid options: {', '.join(sorted(valid_analysis))}

You can also combine analyses with '+':
  ef+esp    # Run both electric field and ESP
  ef+estab  # Run electric field and stabilization

Examples:
  ef, /path/to/job.molden, /path/to/job.xyz, (25, 26)
  esp, /path/to/job.molden, /path/to/job.xyz, 30
  estab, /path/to/job.molden, /path/to/job.xyz

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.2 (Job Batch File Format)
{'='*60}
""")
            analysis_types.append(columns[0])  # Keep original case

            # Extract molden path (column 2)
            molden_path = columns[1].strip()
            if not molden_path:
                raise ValueError(f"""
{'='*60}
ERROR: Missing molden file path on line {line_num}
{'='*60}
Column 2 is empty or missing.

Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices]

Example:
  ef, /path/to/job/optim.molden, /path/to/job/optim.xyz, 25

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.2 (Job Batch File Format)
{'='*60}
""")
            molden_paths.append(molden_path)

            # Extract xyz path (column 3)
            xyz_path = columns[2].strip()
            if not xyz_path:
                raise ValueError(f"""
{'='*60}
ERROR: Missing XYZ file path on line {line_num}
{'='*60}
Column 3 is empty or missing.

Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices]

Example:
  ef, /path/to/job/optim.molden, /path/to/job/optim.xyz, 25

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.2 (Job Batch File Format)
{'='*60}
""")
            xyz_paths.append(xyz_path)

            # Parse remaining columns for metal indices and/or bond tuples
            remainder = ','.join(columns[3:])
            has_tuples = '(' in remainder and ')' in remainder

            if has_tuples:
                # Parse tuple format for bonds
                job_bonds = []
                metal_idx = None

                tuple_pattern = r'\(\s*(\d+)\s*,\s*(\d+)\s*\)'
                matches = re.findall(tuple_pattern, remainder)

                for match in matches:
                    atom1, atom2 = int(match[0]), int(match[1])
                    job_bonds.append((atom1, atom2))
                    # Set metal_idx to first atom of first bond (for compatibility)
                    if metal_idx is None:
                        metal_idx = atom1

                metal_indices.append(metal_idx)
                bond_indices.append(job_bonds)

            elif len(columns) > 3 and columns[3]:
                # Single metal index
                try:
                    metal_index = int(columns[3])
                    metal_indices.append(metal_index)

                    # Check if there's a bonded atom index
                    if len(columns) > 4 and columns[4]:
                        bonded_atom_index = int(columns[4])
                        bond_indices.append([(metal_index, bonded_atom_index)])
                    else:
                        bond_indices.append([])
                except ValueError:
                    raise ValueError(f"""
{'='*60}
ERROR: Invalid atom index format on line {line_num}
{'='*60}
Specified: '{columns[3]}'
Expected: integer or tuple format

Valid formats:
  - Single atom index: 25
  - Bond tuple: (25, 26)
  - Multiple bonds: (25, 26), (25, 27)

Examples:
  ef, /path/to/job.molden, /path/to/job.xyz, (25, 26)
  esp, /path/to/job.molden, /path/to/job.xyz, 30

Note: Atom indices are 0-based (first atom = 0)

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.2 (Job Batch File Format)
{'='*60}
""")
            else:
                # No additional data
                metal_indices.append(None)
                bond_indices.append([])

    # Return None for metal/bond indices if all are empty/None
    if metal_indices and all(m is None for m in metal_indices):
        metal_indices = None
    if bond_indices and all(not b for b in bond_indices):
        bond_indices = None

    return analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices

[docs]def read_file_lines(file_path):
    """Reads in auxiliary files containing job information"""
    with open(file_path, 'r') as file:
        return [line.strip() for line in file.readlines()]

[docs]def run_analysis(job_name, molden_paths, xyz_paths, analysis_types, metal_indices, bond_indices, dielectric,
         geom_flag,
         multiwfn_path,
         charge_types=['Hirshfeld_I'], multipole_bool=True, use_multipole=False,
         save_atomwise_decomposition=False, substrate_idxs=None, env_idxs=None, multipole_order=2,
         include_ptchgs=False, ptchg_file='', dielectric_scale=1.0, use_ecp=False, exclude_atoms=[]):
    """
    Main function for running the pyEF workflow.

    Parameters
    ----------
    job_name : str
        Name for output files
    molden_paths : list of str
        List of paths to .molden files
    xyz_paths : list of str
        List of paths to .xyz files
    analysis_types : list of str
        List of analysis types per job ('ef', 'esp', 'estab', or combinations)
    metal_indices : list of int or None
        List of metal atom indices (only required for ESP analysis or auto-finding bonds)
    bond_indices : list of tuples
        List of bond pairs (required for E-field if metal_indices not provided)
    dielectric : float
        Dielectric constant
    geom_flag : bool
        Run geometry checks
    multiwfn_path : str
        Path to Multiwfn executable
    charge_types : list of str, optional
        Charge partitioning schemes (default: ['Hirshfeld_I'])
    multipole_bool : bool, optional
        Use multipole expansion for E-field (default: True)
    use_multipole : bool, optional
        Use multipole expansion for ESP (default: False)
    save_atomwise_decomposition : bool, optional
        Save atom-wise decomposition to CSV (default: False)
    substrate_idxs : list or None, optional
        Substrate atom indices for estab calculation (default: None)
    env_idxs : list or None, optional
        Environment atom indices for estab calculation (default: None)
    multipole_order : int, optional
        Multipole expansion order (default: 2)
    include_ptchgs : bool, optional
        Include MM point charges from QM/MM calculations (default: False)
    ptchg_file : str, optional
        Filename of point charge file (default: '')
        File is looked for in the same directory as each job's molden file.
        Example: 'pointcharges.txt' will look for this file in each job directory.
    dielectric_scale : float, optional
        Dielectric scaling factor for MM charges (default: 1.0)
    use_ecp : bool, optional
        Use effective core potential (ECP) correction for molden files (default: False)
    exclude_atoms : list of int or list of lists, optional
        Atom indices to exclude from E-field calculations (default: [])
        Can be either:
        - Flat list [0, 1, 2] for same exclusions across all jobs
        - Nested list [[0, 1], [2, 3], []] for different exclusions per job
    """
    from pyef.analysis import Electrostatics
    from collections import defaultdict

    # Validate inputs
    if len(molden_paths) != len(xyz_paths) or len(molden_paths) != len(analysis_types):
        raise ValueError("molden_paths, xyz_paths, and analysis_types must have the same length")

    # Determine if using per-job analysis types (always True in new format)
    use_per_job_analysis = True

    # Group jobs by analysis type for efficient batch processing
    analysis_groups = defaultdict(list)  # {analysis_type: [job_indices, ...]}

    for idx, analysis_type in enumerate(analysis_types):
        if analysis_type:
            # Handle combined analysis types (e.g., 'ef+esp')
            for atype in analysis_type.lower().replace('+', ' ').split():
                analysis_groups[atype].append(idx)

    # Process each analysis type
    for analysis_type, job_indices in analysis_groups.items():
        # Filter data for this analysis type
        filtered_molden = [molden_paths[i] for i in job_indices]
        filtered_xyz = [xyz_paths[i] for i in job_indices]
        filtered_metals = [metal_indices[i] if metal_indices else None for i in job_indices]
        filtered_bonds = [bond_indices[i] if bond_indices else [] for i in job_indices]

        print(f"\n{'='*60}")
        print(f"Running {analysis_type.upper()} analysis on {len(filtered_molden)} jobs")
        print(f"{'='*60}\n")

        # Initialize Electrostatics Object for this group
        incage_bool = False
        if any(filtered_metals):
            dataObject = Electrostatics(filtered_molden, filtered_xyz,
                                      esp_atom_idx=filtered_metals,
                                      incage_bool=incage_bool, dielectric=dielectric)
        else:
            dataObject = Electrostatics(filtered_molden, filtered_xyz,
                                      incage_bool=incage_bool, dielectric=dielectric)

        # Configure QM/MM if requested
        if include_ptchgs and ptchg_file:
            dataObject.includePtChgs(ptchg_file)
            dataObject.set_dielectric_scale(dielectric_scale)

        # Set excluded atoms if specified
        if exclude_atoms:
            dataObject.setExcludeAtomFromCalc(exclude_atoms)

        # Prepare data
        dataObject.prepData()
        if use_ecp:
            dataObject.fix_allECPmolden()

        # Run geometry check if requested
        if geom_flag:
            print("Warning: geometry checks are not implemented yet; skipping.")

        # Run the appropriate analysis
        if analysis_type == 'esp':
            # ESP can use multiple charge types, so we'll use the first one for the filename
            chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types
            ESPdata_filename = f'{analysis_type}_{job_name}_{chg_type}'
            dataObject.getESP(charge_types, ESPdata_filename,
                           multiwfn_path, use_multipole=use_multipole,
                           dielectric=dielectric)

        elif analysis_type == 'ef':
            chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types
            Efielddata_filename = f'{analysis_type}_{job_name}_{chg_type}'
            dataObject.getEfield(chg_type,
                               Efielddata_filename, multiwfn_path,
                               multipole_bool=multipole_bool, input_bond_indices=filtered_bonds,
                               save_atomwise_decomposition=save_atomwise_decomposition, dielectric=dielectric)

        elif analysis_type == 'estab':
            if substrate_idxs is None:
                print(f"Warning: substrate_idxs not specified for {analysis_type}. Skipping...")
            else:
                chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types
                estab_filename = f'{analysis_type}_{job_name}_{chg_type}'
                dataObject.getElectrostatic_stabilization(
                    multiwfn_path,
                    substrate_idxs=substrate_idxs,
                    charge_type=chg_type,
                    name_dataStorage=estab_filename,
                    env_idxs=env_idxs,
                    save_atomwise_decomposition=save_atomwise_decomposition,
                    multipole_order=multipole_order
                )

@click.group(invoke_without_command=True)
@click.option("--config", "-c", type=click.Path(exists=True), help="Path to the configuration YAML file")
@click.pass_context
def cli(ctx, config):
    """CLI entry point"""
    if ctx.invoked_subcommand is None and config:
        ctx.invoke(run, config=config)
    elif ctx.invoked_subcommand is None:
        click.echo(ctx.get_help())

@click.command()
@click.option("--config", "-c", required=True, type=click.Path(exists=True), help="Path to the configuration YAML file")
def run(config):
    config_data = read_config(config)
    input = config_data.get('input', '')
    dielectric = config_data.get('dielectric', 1)
    run_ef = config_data.get('ef', False)
    run_esp = config_data.get('esp', False)
    run_estab = config_data.get('estab', False)
    run_geometry_check = config_data.get('geometry_check', False)
    multiwfn_path = config_data.get('multiwfn_path', False)
    # Additional configuration options
    charge_types = config_data.get('charge_types', ['Hirshfeld_I'])
    multipole_bool = config_data.get('multipole', True)
    use_multipole = config_data.get('use_multipole', False)
    save_atomwise_decomposition = config_data.get('save_atomwise_decomposition', False)
    substrate_idxs = config_data.get('substrate_idxs', None)
    env_idxs = config_data.get('env_idxs', None)
    multipole_order = config_data.get('multipole_order', 2)

    # QM/MM options
    include_ptchgs = config_data.get('include_ptchgs', False)
    ptchg_file = config_data.get('ptchg_file', '')
    dielectric_scale = config_data.get('dielectric_scale', 1.0)

    # ECP (Effective Core Potential) option
    use_ecp = config_data.get('use_ecp', False)

    # Exclude atoms from E-field calculation
    exclude_atoms = config_data.get('exclude_atoms', [])

    # ===== VALIDATE CONFIG PARAMETERS =====

    # Validate multiwfn_path exists and is executable
    if not multiwfn_path:
        raise ValueError(f"""
{'='*60}
ERROR: Missing multiwfn_path in configuration
{'='*60}
The 'multiwfn_path' parameter is required but not specified.

Multiwfn is required for all pyEF calculations.

Example in config.yaml:
  multiwfn_path: /usr/local/bin/Multiwfn

For installation instructions, see:
- README.md: Section 4 (Installation)
- https://pyef.readthedocs.io/
{'='*60}
""")
    check_path_exists(multiwfn_path, path_type="file", context="multiwfn_path")

    # Validate input file exists
    if not input:
        raise ValueError(f"""
{'='*60}
ERROR: Missing input file in configuration
{'='*60}
The 'input' parameter is required but not specified.

Example in config.yaml:
  input: jobs.csv

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2 (Quick Start Guide)
{'='*60}
""")
    check_path_exists(input, path_type="file", context="input file")

    # Validate dielectric constant
    validate_numeric_range(dielectric, "dielectric", min_val=0.001, context="config file")

    # Validate multipole_order
    validate_numeric_range(multipole_order, "multipole_order", allowed_values=[1, 2, 3], context="config file")

    # Validate dielectric_scale
    validate_numeric_range(dielectric_scale, "dielectric_scale", min_val=0.001, context="config file")

    # Validate charge types
    if not isinstance(charge_types, list):
        raise ValueError(f"""
{'='*60}
ERROR: Invalid charge_types format in config
{'='*60}
Expected: list of charge type strings
Got: {type(charge_types).__name__}

Example in config.yaml:
  charge_types: ['Hirshfeld_I']
  # or
  charge_types: ['Hirshfeld', 'Becke']

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

    for charge_type in charge_types:
        validate_charge_type(charge_type, context="config file")

    # Validate substrate_idxs and env_idxs don't overlap (for estab calculations)
    if substrate_idxs is not None and env_idxs is not None:
        if not isinstance(substrate_idxs, list):
            raise ValueError(f"""
{'='*60}
ERROR: Invalid substrate_idxs format
{'='*60}
Expected: list of integers
Got: {type(substrate_idxs).__name__}

Example in config.yaml:
  substrate_idxs: [0, 1, 2, 3]

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

        if not isinstance(env_idxs, list):
            raise ValueError(f"""
{'='*60}
ERROR: Invalid env_idxs format
{'='*60}
Expected: list of integers
Got: {type(env_idxs).__name__}

Example in config.yaml:
  env_idxs: [10, 11, 12, 13]

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

        check_index_overlap(substrate_idxs, env_idxs, "substrate_idxs", "env_idxs", "electrostatic stabilization")

    # Validate QM/MM options
    if include_ptchgs and not ptchg_file:
        raise ValueError(f"""
{'='*60}
ERROR: Missing ptchg_file for QM/MM calculation
{'='*60}
include_ptchgs is set to true, but ptchg_file is not specified.

Example in config.yaml:
  include_ptchgs: true
  ptchg_file: 'pointcharges.txt'

Note: The file should exist in the same directory as each job's
molden file.

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 3.4 (QM/MM Calculations)
{'='*60}
""")

    # ===== END VALIDATION =====

    # Parse job batch file
    analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices = parse_job_batch_file(input)
    job_name = os.path.splitext(input)[0]

    # New format always has analysis types specified per-job
    if not analysis_types:
        click.echo("Error: No valid jobs found in the input file.")
        click.echo("Expected format: analysis_type, path_to_molden, path_to_xyz [, atom_indices]")
        click.echo("Examples:")
        click.echo("  ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26)")
        click.echo("  esp, /path/to/optim.molden, /path/to/optim.xyz, 30")
        click.echo("  estab, /path/to/optim.molden, /path/to/optim.xyz")
        return

    # Run per-job analyses (new behavior - each job specifies its own analysis)
    run_analysis(
        job_name, molden_paths, xyz_paths, analysis_types, metal_indices, bond_indices, dielectric,
        run_geometry_check,
        multiwfn_path,
        charge_types, multipole_bool, use_multipole,
        save_atomwise_decomposition, substrate_idxs, env_idxs, multipole_order,
        include_ptchgs, ptchg_file, dielectric_scale, use_ecp, exclude_atoms
    )

cli.add_command(run)

if __name__ == '__main__':
    cli()