Source code for pyef.cli

"""Command-line interface (CLI) entry point"""

import os
import yaml
import click
from .validation import (
    check_path_exists,
    validate_charge_type,
    validate_numeric_range,
    check_index_overlap,
    VALID_CHARGE_TYPES
)

[docs]def welcome(): """Print first to welcome the user while it waits to load the modules""" print("\n") print(" ╔═══════════════════════╗") print(" ║ ┌──────┐ ║") print(" ║ │ ┌───┘ ║") print(" ║ │ └───┐ ║") print(" ║ │ ┌───┘ ║") print(" ║ │ ├┬┬┬┐ ║") print(" ║ └──┴┴┴┴┘ ║") print(" ║ ║") print(" ║ WELCOME TO PYEF ║") print(" ╚═══════════╗╔══════════╝") print(" ╔═══════╝╚═══════╗ ") print(" ║ THE KULIK LAB ║ ") print(" ╚═══════╗╔═══════╝ ") print(" ╔══════════════════════╝╚══════════════════════╗ ") print(" ║ Code: github.com/davidkastner/pyef ║ ") print(" ║ Docs: pyef.readthedocs.io ║ ") print(" ║ - Usage: pyef -c pyef.in ║ ") print(" ║ Example: github.com/davidkastner/pyef/demo ║ ") print(" ╚══════════════════════════════════════════════╝ \n")
# Welcome even if no flags welcome() # Read in the configuration yaml file
[docs]def read_config(config_file): with open(config_file, 'r') as file: return yaml.safe_load(file)
[docs]def parse_job_batch_file(file_path): """ Parse a CSV file and extract specific columns as lists and tuples. The input file allows Python style comments on any line. Parameters ---------- file_path : str The file path of the CSV file to be parsed. Returns ------- analysis_types : list of str List of analysis types ('ef', 'esp', 'estab', or combinations like 'ef+esp'). molden_paths : list of str List of paths to .molden files. xyz_paths : list of str List of paths to .xyz files. metal_indices : list of int or None List containing metal atom indices. None if metal indices are not provided. bond_indices : list of tuples or None List of bond tuples for each job. None if bond indices are not provided. Notes ----- New format (required): - analysis_type, path_to_molden, path_to_xyz, [bond_tuples or metal_index] - Examples: - ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26), (25, 27) - esp, /path/to/optim.molden, /path/to/optim.xyz, 30 - estab, /path/to/optim.molden, /path/to/optim.xyz - ef+esp, /path/to/optim.molden, /path/to/optim.xyz, 35 """ import re analysis_types = [] molden_paths = [] xyz_paths = [] metal_indices = [] bond_indices = [] # Valid analysis type keywords valid_analysis = {'ef', 'esp', 'estab'} with open(file_path, 'r') as file: for line_num, line in enumerate(file, 1): # Skip empty lines and comments that start with '#' if line.strip() == '' or line.strip().startswith('#'): continue # Remove comments from the line line = line.split('#')[0].strip() # Skip the line if it's empty after removing the comment if line == '': continue # Split line into columns (initially by comma) columns = [col.strip() for col in line.split(',')] # Validate minimum number of columns if len(columns) < 3: raise ValueError(f""" {'='*60} ERROR: Invalid CSV format on line {line_num} {'='*60} Expected at least 3 columns, got {len(columns)} Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices] Current line: {line} Examples of correct format: ef, /path/to/job.molden, /path/to/job.xyz, (25, 26) esp, /path/to/job.molden, /path/to/job.xyz, 30 estab, /path/to/job.molden, /path/to/job.xyz ef+esp, /path/to/job.molden, /path/to/job.xyz, (10, 11) For more examples, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml (lines 67-74) - README.md: Section 2.2 (Job Batch File Format) {'='*60} """) # Extract analysis type (column 1) analysis_type = columns[0].lower().strip() analysis_parts = set(analysis_type.replace('+', ' ').split()) if not analysis_parts.issubset(valid_analysis) or not analysis_parts: raise ValueError(f""" {'='*60} ERROR: Invalid analysis type on line {line_num} {'='*60} Specified: '{columns[0]}' Valid options: {', '.join(sorted(valid_analysis))} You can also combine analyses with '+': ef+esp # Run both electric field and ESP ef+estab # Run electric field and stabilization Examples: ef, /path/to/job.molden, /path/to/job.xyz, (25, 26) esp, /path/to/job.molden, /path/to/job.xyz, 30 estab, /path/to/job.molden, /path/to/job.xyz For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 2.2 (Job Batch File Format) {'='*60} """) analysis_types.append(columns[0]) # Keep original case # Extract molden path (column 2) molden_path = columns[1].strip() if not molden_path: raise ValueError(f""" {'='*60} ERROR: Missing molden file path on line {line_num} {'='*60} Column 2 is empty or missing. Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices] Example: ef, /path/to/job/optim.molden, /path/to/job/optim.xyz, 25 For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 2.2 (Job Batch File Format) {'='*60} """) molden_paths.append(molden_path) # Extract xyz path (column 3) xyz_path = columns[2].strip() if not xyz_path: raise ValueError(f""" {'='*60} ERROR: Missing XYZ file path on line {line_num} {'='*60} Column 3 is empty or missing. Format: analysis_type, path_to_molden, path_to_xyz, [atom_indices] Example: ef, /path/to/job/optim.molden, /path/to/job/optim.xyz, 25 For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 2.2 (Job Batch File Format) {'='*60} """) xyz_paths.append(xyz_path) # Parse remaining columns for metal indices and/or bond tuples remainder = ','.join(columns[3:]) has_tuples = '(' in remainder and ')' in remainder if has_tuples: # Parse tuple format for bonds job_bonds = [] metal_idx = None tuple_pattern = r'\(\s*(\d+)\s*,\s*(\d+)\s*\)' matches = re.findall(tuple_pattern, remainder) for match in matches: atom1, atom2 = int(match[0]), int(match[1]) job_bonds.append((atom1, atom2)) # Set metal_idx to first atom of first bond (for compatibility) if metal_idx is None: metal_idx = atom1 metal_indices.append(metal_idx) bond_indices.append(job_bonds) elif len(columns) > 3 and columns[3]: # Single metal index try: metal_index = int(columns[3]) metal_indices.append(metal_index) # Check if there's a bonded atom index if len(columns) > 4 and columns[4]: bonded_atom_index = int(columns[4]) bond_indices.append([(metal_index, bonded_atom_index)]) else: bond_indices.append([]) except ValueError: raise ValueError(f""" {'='*60} ERROR: Invalid atom index format on line {line_num} {'='*60} Specified: '{columns[3]}' Expected: integer or tuple format Valid formats: - Single atom index: 25 - Bond tuple: (25, 26) - Multiple bonds: (25, 26), (25, 27) Examples: ef, /path/to/job.molden, /path/to/job.xyz, (25, 26) esp, /path/to/job.molden, /path/to/job.xyz, 30 Note: Atom indices are 0-based (first atom = 0) For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 2.2 (Job Batch File Format) {'='*60} """) else: # No additional data metal_indices.append(None) bond_indices.append([]) # Return None for metal/bond indices if all are empty/None if metal_indices and all(m is None for m in metal_indices): metal_indices = None if bond_indices and all(not b for b in bond_indices): bond_indices = None return analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices
[docs]def read_file_lines(file_path): """Reads in auxiliary files containing job information""" with open(file_path, 'r') as file: return [line.strip() for line in file.readlines()]
[docs]def run_analysis(job_name, molden_paths, xyz_paths, analysis_types, metal_indices, bond_indices, dielectric, geom_flag, multiwfn_path, charge_types=['Hirshfeld_I'], multipole_bool=True, use_multipole=False, save_atomwise_decomposition=False, substrate_idxs=None, env_idxs=None, multipole_order=2, include_ptchgs=False, ptchg_file='', dielectric_scale=1.0, use_ecp=False, exclude_atoms=[]): """ Main function for running the pyEF workflow. Parameters ---------- job_name : str Name for output files molden_paths : list of str List of paths to .molden files xyz_paths : list of str List of paths to .xyz files analysis_types : list of str List of analysis types per job ('ef', 'esp', 'estab', or combinations) metal_indices : list of int or None List of metal atom indices (only required for ESP analysis or auto-finding bonds) bond_indices : list of tuples List of bond pairs (required for E-field if metal_indices not provided) dielectric : float Dielectric constant geom_flag : bool Run geometry checks multiwfn_path : str Path to Multiwfn executable charge_types : list of str, optional Charge partitioning schemes (default: ['Hirshfeld_I']) multipole_bool : bool, optional Use multipole expansion for E-field (default: True) use_multipole : bool, optional Use multipole expansion for ESP (default: False) save_atomwise_decomposition : bool, optional Save atom-wise decomposition to CSV (default: False) substrate_idxs : list or None, optional Substrate atom indices for estab calculation (default: None) env_idxs : list or None, optional Environment atom indices for estab calculation (default: None) multipole_order : int, optional Multipole expansion order (default: 2) include_ptchgs : bool, optional Include MM point charges from QM/MM calculations (default: False) ptchg_file : str, optional Filename of point charge file (default: '') File is looked for in the same directory as each job's molden file. Example: 'pointcharges.txt' will look for this file in each job directory. dielectric_scale : float, optional Dielectric scaling factor for MM charges (default: 1.0) use_ecp : bool, optional Use effective core potential (ECP) correction for molden files (default: False) exclude_atoms : list of int or list of lists, optional Atom indices to exclude from E-field calculations (default: []) Can be either: - Flat list [0, 1, 2] for same exclusions across all jobs - Nested list [[0, 1], [2, 3], []] for different exclusions per job """ from pyef.analysis import Electrostatics from collections import defaultdict # Validate inputs if len(molden_paths) != len(xyz_paths) or len(molden_paths) != len(analysis_types): raise ValueError("molden_paths, xyz_paths, and analysis_types must have the same length") # Determine if using per-job analysis types (always True in new format) use_per_job_analysis = True # Group jobs by analysis type for efficient batch processing analysis_groups = defaultdict(list) # {analysis_type: [job_indices, ...]} for idx, analysis_type in enumerate(analysis_types): if analysis_type: # Handle combined analysis types (e.g., 'ef+esp') for atype in analysis_type.lower().replace('+', ' ').split(): analysis_groups[atype].append(idx) # Process each analysis type for analysis_type, job_indices in analysis_groups.items(): # Filter data for this analysis type filtered_molden = [molden_paths[i] for i in job_indices] filtered_xyz = [xyz_paths[i] for i in job_indices] filtered_metals = [metal_indices[i] if metal_indices else None for i in job_indices] filtered_bonds = [bond_indices[i] if bond_indices else [] for i in job_indices] print(f"\n{'='*60}") print(f"Running {analysis_type.upper()} analysis on {len(filtered_molden)} jobs") print(f"{'='*60}\n") # Initialize Electrostatics Object for this group incage_bool = False if any(filtered_metals): dataObject = Electrostatics(filtered_molden, filtered_xyz, esp_atom_idx=filtered_metals, incage_bool=incage_bool, dielectric=dielectric) else: dataObject = Electrostatics(filtered_molden, filtered_xyz, incage_bool=incage_bool, dielectric=dielectric) # Configure QM/MM if requested if include_ptchgs and ptchg_file: dataObject.includePtChgs(ptchg_file) dataObject.set_dielectric_scale(dielectric_scale) # Set excluded atoms if specified if exclude_atoms: dataObject.setExcludeAtomFromCalc(exclude_atoms) # Prepare data dataObject.prepData() if use_ecp: dataObject.fix_allECPmolden() # Run geometry check if requested if geom_flag: print("Warning: geometry checks are not implemented yet; skipping.") # Run the appropriate analysis if analysis_type == 'esp': # ESP can use multiple charge types, so we'll use the first one for the filename chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types ESPdata_filename = f'{analysis_type}_{job_name}_{chg_type}' dataObject.getESP(charge_types, ESPdata_filename, multiwfn_path, use_multipole=use_multipole, dielectric=dielectric) elif analysis_type == 'ef': chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types Efielddata_filename = f'{analysis_type}_{job_name}_{chg_type}' dataObject.getEfield(chg_type, Efielddata_filename, multiwfn_path, multipole_bool=multipole_bool, input_bond_indices=filtered_bonds, save_atomwise_decomposition=save_atomwise_decomposition, dielectric=dielectric) elif analysis_type == 'estab': if substrate_idxs is None: print(f"Warning: substrate_idxs not specified for {analysis_type}. Skipping...") else: chg_type = charge_types[0] if isinstance(charge_types, list) else charge_types estab_filename = f'{analysis_type}_{job_name}_{chg_type}' dataObject.getElectrostatic_stabilization( multiwfn_path, substrate_idxs=substrate_idxs, charge_type=chg_type, name_dataStorage=estab_filename, env_idxs=env_idxs, save_atomwise_decomposition=save_atomwise_decomposition, multipole_order=multipole_order )
@click.group(invoke_without_command=True) @click.option("--config", "-c", type=click.Path(exists=True), help="Path to the configuration YAML file") @click.pass_context def cli(ctx, config): """CLI entry point""" if ctx.invoked_subcommand is None and config: ctx.invoke(run, config=config) elif ctx.invoked_subcommand is None: click.echo(ctx.get_help()) @click.command() @click.option("--config", "-c", required=True, type=click.Path(exists=True), help="Path to the configuration YAML file") def run(config): config_data = read_config(config) input = config_data.get('input', '') dielectric = config_data.get('dielectric', 1) run_ef = config_data.get('ef', False) run_esp = config_data.get('esp', False) run_estab = config_data.get('estab', False) run_geometry_check = config_data.get('geometry_check', False) multiwfn_path = config_data.get('multiwfn_path', False) # Additional configuration options charge_types = config_data.get('charge_types', ['Hirshfeld_I']) multipole_bool = config_data.get('multipole', True) use_multipole = config_data.get('use_multipole', False) save_atomwise_decomposition = config_data.get('save_atomwise_decomposition', False) substrate_idxs = config_data.get('substrate_idxs', None) env_idxs = config_data.get('env_idxs', None) multipole_order = config_data.get('multipole_order', 2) # QM/MM options include_ptchgs = config_data.get('include_ptchgs', False) ptchg_file = config_data.get('ptchg_file', '') dielectric_scale = config_data.get('dielectric_scale', 1.0) # ECP (Effective Core Potential) option use_ecp = config_data.get('use_ecp', False) # Exclude atoms from E-field calculation exclude_atoms = config_data.get('exclude_atoms', []) # ===== VALIDATE CONFIG PARAMETERS ===== # Validate multiwfn_path exists and is executable if not multiwfn_path: raise ValueError(f""" {'='*60} ERROR: Missing multiwfn_path in configuration {'='*60} The 'multiwfn_path' parameter is required but not specified. Multiwfn is required for all pyEF calculations. Example in config.yaml: multiwfn_path: /usr/local/bin/Multiwfn For installation instructions, see: - README.md: Section 4 (Installation) - https://pyef.readthedocs.io/ {'='*60} """) check_path_exists(multiwfn_path, path_type="file", context="multiwfn_path") # Validate input file exists if not input: raise ValueError(f""" {'='*60} ERROR: Missing input file in configuration {'='*60} The 'input' parameter is required but not specified. Example in config.yaml: input: jobs.csv For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 2 (Quick Start Guide) {'='*60} """) check_path_exists(input, path_type="file", context="input file") # Validate dielectric constant validate_numeric_range(dielectric, "dielectric", min_val=0.001, context="config file") # Validate multipole_order validate_numeric_range(multipole_order, "multipole_order", allowed_values=[1, 2, 3], context="config file") # Validate dielectric_scale validate_numeric_range(dielectric_scale, "dielectric_scale", min_val=0.001, context="config file") # Validate charge types if not isinstance(charge_types, list): raise ValueError(f""" {'='*60} ERROR: Invalid charge_types format in config {'='*60} Expected: list of charge type strings Got: {type(charge_types).__name__} Example in config.yaml: charge_types: ['Hirshfeld_I'] # or charge_types: ['Hirshfeld', 'Becke'] For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml {'='*60} """) for charge_type in charge_types: validate_charge_type(charge_type, context="config file") # Validate substrate_idxs and env_idxs don't overlap (for estab calculations) if substrate_idxs is not None and env_idxs is not None: if not isinstance(substrate_idxs, list): raise ValueError(f""" {'='*60} ERROR: Invalid substrate_idxs format {'='*60} Expected: list of integers Got: {type(substrate_idxs).__name__} Example in config.yaml: substrate_idxs: [0, 1, 2, 3] For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml {'='*60} """) if not isinstance(env_idxs, list): raise ValueError(f""" {'='*60} ERROR: Invalid env_idxs format {'='*60} Expected: list of integers Got: {type(env_idxs).__name__} Example in config.yaml: env_idxs: [10, 11, 12, 13] For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml {'='*60} """) check_index_overlap(substrate_idxs, env_idxs, "substrate_idxs", "env_idxs", "electrostatic stabilization") # Validate QM/MM options if include_ptchgs and not ptchg_file: raise ValueError(f""" {'='*60} ERROR: Missing ptchg_file for QM/MM calculation {'='*60} include_ptchgs is set to true, but ptchg_file is not specified. Example in config.yaml: include_ptchgs: true ptchg_file: 'pointcharges.txt' Note: The file should exist in the same directory as each job's molden file. For more information, see: - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml - README.md: Section 3.4 (QM/MM Calculations) {'='*60} """) # ===== END VALIDATION ===== # Parse job batch file analysis_types, molden_paths, xyz_paths, metal_indices, bond_indices = parse_job_batch_file(input) job_name = os.path.splitext(input)[0] # New format always has analysis types specified per-job if not analysis_types: click.echo("Error: No valid jobs found in the input file.") click.echo("Expected format: analysis_type, path_to_molden, path_to_xyz [, atom_indices]") click.echo("Examples:") click.echo(" ef, /path/to/optim.molden, /path/to/optim.xyz, (25, 26)") click.echo(" esp, /path/to/optim.molden, /path/to/optim.xyz, 30") click.echo(" estab, /path/to/optim.molden, /path/to/optim.xyz") return # Run per-job analyses (new behavior - each job specifies its own analysis) run_analysis( job_name, molden_paths, xyz_paths, analysis_types, metal_indices, bond_indices, dielectric, run_geometry_check, multiwfn_path, charge_types, multipole_bool, use_multipole, save_atomwise_decomposition, substrate_idxs, env_idxs, multipole_order, include_ptchgs, ptchg_file, dielectric_scale, use_ecp, exclude_atoms ) cli.add_command(run) if __name__ == '__main__': cli()