Source code for pyef.validation

"""
Validation utilities for pyEF.

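This module provides centralized validation functions for input parameters,
file paths, and data formats. The parsing, ``validate_*`` and ``check_*``
functions raise ValueError with helpful error messages that include examples
and references to documentation. The ``filter_*`` helpers instead print a
warning for unsupported charge types and return only the supported ones.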
"""

import os
from typing import Union, List, Optional, Any


# Charge partitioning schemes accepted anywhere in pyEF (names are case-sensitive).
VALID_CHARGE_TYPES = {
    'Hirshfeld',
    'Hirshfeld_I',
    'Voronoi',
    'Mulliken',
    'Lowdin',
    'SCPA',
    'Becke',
    'ADCH',
    'CHELPG',
    'MK',
    'AIM',
    'CM5',
    'EEM',
    'RESP',
    'PEOE',
}

# Subset of the schemes above that can be used for multipole analysis
# (only these work with Multiwfn multipole mode).
VALID_MULTIPOLE_CHARGE_TYPES = {
    'Hirshfeld',
    'Hirshfeld_I',
    'Becke',
}


def get_atom_count_from_xyz(xyz_path: str) -> int:
    """
    Parse the number of atoms from the first line of an XYZ file.

    Only the first line is read; the coordinate lines are not inspected.

    Parameters
    ----------
    xyz_path : str
        Path to the XYZ file

    Returns
    -------
    int
        Number of atoms declared on the first line of the file

    Raises
    ------
    ValueError
        If the file doesn't exist, cannot be read, or its first line is not
        a non-negative integer
    """
    if not os.path.exists(xyz_path):
        raise ValueError(f"""
{'='*60}
ERROR: XYZ file not found
{'='*60}
Path: {xyz_path}

The file does not exist. Please check the path.

For help with file paths, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
  - README.md: Section 2.2 (Input File Formats)
{'='*60}
""")

    try:
        with open(xyz_path, 'r') as f:
            first_line = f.readline().strip()
        atom_count = int(first_line)
        # int() happily parses "-3"; a negative atom count is never a valid
        # XYZ header, so route it through the same format error below.
        if atom_count < 0:
            raise ValueError(
                f"atom count must be non-negative, got {atom_count}"
            )
    except (ValueError, OSError) as e:
        # Chain the original exception so the root cause stays visible in
        # tracebacks (OSError covers unreadable paths such as directories).
        raise ValueError(f"""
{'='*60}
ERROR: Invalid XYZ file format
{'='*60}
File: {xyz_path}
Issue: {str(e)}

XYZ format requires:
  - First line: number of atoms (integer)
  - Second line: comment
  - Following lines: element x y z

Example XYZ file:
  3
  Water molecule
  O  0.000  0.000  0.000
  H  0.757  0.586  0.000
  H -0.757  0.586  0.000

For more information, see:
  - README.md: Section 2.2 (Input File Formats)
{'='*60}
""") from e

    return atom_count
def validate_charge_type(charge_type: str, context: str = "") -> None:
    """
    Validate that a charge type is in the set of supported schemes.

    Matching against VALID_CHARGE_TYPES is case-sensitive. When the input
    differs from a supported scheme only by letter case, the error message
    suggests the correctly-cased name.

    Parameters
    ----------
    charge_type : str
        The charge partitioning scheme to validate
    context : str, optional
        Additional context for the error message

    Raises
    ------
    ValueError
        If the charge type is not a string or is not a supported scheme
    """
    is_text = isinstance(charge_type, str)
    if is_text and charge_type in VALID_CHARGE_TYPES:
        return

    context_str = f" in {context}" if context else ""

    # Non-string input (None, numbers, lists, ...) must surface as the
    # documented ValueError rather than an AttributeError/TypeError, so the
    # case-insensitive suggestion lookup only runs for real strings.
    if is_text:
        shown = f"'{charge_type}'"
        lowered = charge_type.lower()
        similar = [ct for ct in VALID_CHARGE_TYPES if ct.lower() == lowered]
    else:
        shown = f"{charge_type!r} (type {type(charge_type).__name__}; expected a string)"
        similar = []

    suggestion = (
        f"\nDid you mean: '{similar[0]}'? (Note: charge types are case-sensitive)"
        if similar else ""
    )

    raise ValueError(f"""
{'='*60}
ERROR: Invalid charge partitioning scheme{context_str}
{'='*60}
Specified: {shown}

Valid charge types (case-sensitive):
  - Hirshfeld (fast, good for most systems)
  - Hirshfeld_I (most accurate, slower)
  - CHELPG (for ESP fitting)
  - Becke (fast, less accurate)
  - Mulliken, Lowdin, SCPA, ADCH, Voronoi
  - MK, AIM, CM5, EEM, RESP, PEOE{suggestion}

Example usage in config.yaml:
  charge_types: ['Hirshfeld_I']

Example usage in Python:
  esp_df = estat.getESP(['Hirshfeld_I'], 'output', multiwfn_path)

For more information, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
  - README.md: Section 2.3 (Key Parameters)
{'='*60}
""")
def filter_charge_types_for_multipole(charge_types: Union[str, List[str]], context: str = "") -> List[str]:
    """
    Keep only the charge types that can be used for multipole analysis.

    Every requested scheme is first checked with validate_charge_type, so an
    unknown scheme raises. Known schemes that are not in
    VALID_MULTIPOLE_CHARGE_TYPES are reported on stdout and dropped.

    Parameters
    ----------
    charge_types : str or list of str
        Charge partitioning scheme(s) to validate; a single string is
        treated as a one-element list
    context : str, optional
        Additional context for warning/error messages

    Returns
    -------
    list of str
        The multipole-compatible schemes, in the order they were given.
        Empty when none of the requested schemes is compatible.

    Raises
    ------
    ValueError
        Propagated from validate_charge_type for an unsupported scheme
    """
    requested = [charge_types] if isinstance(charge_types, str) else charge_types
    where = f" in {context}" if context else ""
    rule = '=' * 60

    # General validity comes first: unknown schemes raise here.
    for scheme in requested:
        validate_charge_type(scheme, context=context)

    # Partition into multipole-compatible and incompatible schemes.
    usable = [s for s in requested if s in VALID_MULTIPOLE_CHARGE_TYPES]
    skipped = [s for s in requested if s not in VALID_MULTIPOLE_CHARGE_TYPES]

    if skipped:
        print(f"""
{rule}
WARNING: Invalid charge type(s) for multipole analysis{where}
{rule}
The following charge types do NOT support multipole analysis:
  {', '.join(skipped)}

Only these charge types work with multipole analysis:
  Hirshfeld, Hirshfeld_I, Becke

These charge types will be SKIPPED for multipole calculations.
""")
        if usable:
            print(f"Valid charge types that will be used: {', '.join(usable)}")
        print(f"{rule}\n")

    # Nothing left to run: tell the user and let the caller handle the
    # empty list.
    if not usable:
        print(f"No valid charge types remaining for multipole analysis{where}. Skipping.")
        print(f"{rule}\n")

    return usable
def filter_charge_types_for_monopole(charge_types: Union[str, List[str]], context: str = "") -> List[str]:
    """
    Keep only the charge types that can be used for monopole analysis.

    Any scheme in VALID_CHARGE_TYPES is accepted. Unknown schemes do not
    raise; they are reported on stdout and dropped from the result.

    Parameters
    ----------
    charge_types : str or list of str
        Charge partitioning scheme(s) to validate; a single string is
        treated as a one-element list
    context : str, optional
        Additional context for warning messages

    Returns
    -------
    list of str
        The recognised schemes, in the order they were given. Empty when
        none of the requested schemes is recognised.
    """
    requested = [charge_types] if isinstance(charge_types, str) else charge_types
    where = f" in {context}" if context else ""
    rule = '=' * 60

    # Partition into recognised and unrecognised schemes.
    usable = [s for s in requested if s in VALID_CHARGE_TYPES]
    skipped = [s for s in requested if s not in VALID_CHARGE_TYPES]

    if skipped:
        print(f"""
{rule}
WARNING: Invalid charge type(s) for monopole analysis{where}
{rule}
The following charge types are NOT valid:
  {', '.join(skipped)}

Valid charge types for monopole analysis:
  Hirshfeld, Hirshfeld_I, Voronoi, Mulliken, Lowdin,
  SCPA, Becke, ADCH, CHELPG, MK, AIM, CM5, EEM, RESP, PEOE

These charge types will be SKIPPED.
""")
        if usable:
            print(f"Valid charge types that will be used: {', '.join(usable)}")
        print(f"{rule}\n")

    # Nothing left to run: tell the user and let the caller handle the
    # empty list.
    if not usable:
        print(f"No valid charge types remaining for monopole analysis{where}. Skipping.")
        print(f"{rule}\n")

    return usable
def check_path_exists(path: str, path_type: str = "file", context: str = "") -> None:
    """
    Check if a file or directory path exists.

    ``path_type`` only selects which error message is shown; the existence
    test itself does not distinguish files from directories. When
    ``context`` is "multiwfn_path" (case-insensitive) and ``path_type`` is
    "file", the path must additionally be executable.

    Parameters
    ----------
    path : str
        Path to check
    path_type : str
        Either "file" or "directory"; any value other than "file" produces
        the directory message
    context : str, optional
        Additional context for the error message

    Raises
    ------
    ValueError
        If the path is not a valid path value (e.g. None), does not exist,
        or is the Multiwfn executable without execute permission
    """
    # os.path.exists(None) raises TypeError and an int is interpreted as a
    # file descriptor, so anything that is not path-like is reported as a
    # missing path with the documented ValueError instead.
    is_path_like = isinstance(path, (str, bytes, os.PathLike))

    if not is_path_like or not os.path.exists(path):
        context_str = f" ({context})" if context else ""

        if path_type == "file":
            raise ValueError(f"""
{'='*60}
ERROR: File not found{context_str}
{'='*60}
Path: {path}

The file does not exist. Please check:
  1. The path is correct
  2. The file exists in the specified location
  3. You have read permissions for the file

For help with file paths, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
  - README.md: Section 2 (Quick Start Guide)
{'='*60}
""")
        else:
            raise ValueError(f"""
{'='*60}
ERROR: Directory not found{context_str}
{'='*60}
Path: {path}

The directory does not exist. Please check:
  1. The path is correct
  2. The directory exists
  3. You have read permissions

For help, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

    # Additional check for executability if it's the Multiwfn path
    if context.lower() == "multiwfn_path" and path_type == "file":
        if not os.access(path, os.X_OK):
            raise ValueError(f"""
{'='*60}
ERROR: Multiwfn executable is not executable
{'='*60}
Path: {path}

The file exists but is not executable.

To fix:
  chmod +x {path}

Multiwfn is required for all pyEF calculations.
For installation instructions, see:
  - README.md: Section 4 (Installation)
  - https://pyef.readthedocs.io/
{'='*60}
""")
def validate_numeric_range(
    value: Any,
    name: str,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    allowed_values: Optional[List] = None,
    context: str = ""
) -> None:
    """
    Validate that a numeric parameter is within acceptable range.

    The value must be convertible with ``float()`` and must not be NaN.
    When ``allowed_values`` is given, membership in that list is the only
    check performed and ``min_val``/``max_val`` are ignored.

    Parameters
    ----------
    value : Any
        The value to validate
    name : str
        Parameter name for error messages
    min_val : float, optional
        Minimum allowed value (inclusive)
    max_val : float, optional
        Maximum allowed value (inclusive)
    allowed_values : list, optional
        List of specific allowed values
    context : str, optional
        Additional context for error message

    Raises
    ------
    ValueError
        If the value is not numeric, is NaN, is not one of the allowed
        values, or is out of range
    """
    import math  # function-scope import: only needed for the NaN check

    context_str = f" in {context}" if context else ""

    # Type check
    try:
        numeric_value = float(value)
    except (TypeError, ValueError):
        numeric_value = None

    # NaN compares False against every bound, so it would otherwise slip
    # through both range checks; reject it together with non-numeric input.
    if numeric_value is None or math.isnan(numeric_value):
        raise ValueError(f"""
{'='*60}
ERROR: Invalid type for {name}{context_str}
{'='*60}
Expected: numeric value (int or float)
Got: {type(value).__name__} = {value}

Example:
  {name}: 1.0
  # or
  {name}: 4

For more examples, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

    # Check against allowed values
    if allowed_values is not None:
        if isinstance(value, int):
            # Exact membership for integers
            is_allowed = value in allowed_values
        else:
            # For float values, check if close to any allowed value
            is_allowed = any(abs(numeric_value - av) < 1e-10 for av in allowed_values)

        if not is_allowed:
            # An empty allowed list has no example to show; avoid IndexError.
            example = allowed_values[0] if allowed_values else "n/a"
            raise ValueError(f"""
{'='*60}
ERROR: Invalid value for {name}{context_str}
{'='*60}
Value: {value}
Allowed values: {allowed_values}

Example:
  {name}: {example}

For more information, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")
        return

    # Range validation
    if min_val is not None and numeric_value < min_val:
        example = min_val if min_val >= 1 else 1.0
        # Keep the suggested example inside the permitted range.
        if max_val is not None and example > max_val:
            example = max_val
        raise ValueError(f"""
{'='*60}
ERROR: {name} value too small{context_str}
{'='*60}
Value: {value}
Minimum allowed: {min_val}

Example:
  {name}: {example}

For more information, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

    if max_val is not None and numeric_value > max_val:
        example = max_val if max_val <= 10 else 10.0
        # Keep the suggested example inside the permitted range.
        if min_val is not None and example < min_val:
            example = min_val
        raise ValueError(f"""
{'='*60}
ERROR: {name} value too large{context_str}
{'='*60}
Value: {value}
Maximum allowed: {max_val}

Example:
  {name}: {example}

For more information, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")
def validate_atom_indices(
    indices: Union[int, List[int]],
    atom_count: int,
    context: str = "",
    xyz_path: str = ""
) -> None:
    """
    Validate that atom indices are within structure bounds.

    Parameters
    ----------
    indices : int or list of int
        Atom index or list of atom indices to validate (0-indexed). Any
        ``numbers.Integral`` scalar is accepted as a single index.
    atom_count : int
        Total number of atoms in the structure
    context : str, optional
        Description of where indices are used (for error message)
    xyz_path : str, optional
        Path to XYZ file for reference in error message

    Raises
    ------
    ValueError
        If any index is negative or greater than or equal to ``atom_count``
    """
    import numbers  # function-scope import: only needed for the scalar check

    # Convert single index to list. numbers.Integral (rather than int) lets
    # integer scalars from other numeric libraries be treated as one index
    # instead of failing with "object is not iterable".
    if isinstance(indices, numbers.Integral):
        indices = [indices]

    # Find invalid indices
    invalid = [idx for idx in indices if idx < 0 or idx >= atom_count]
    if not invalid:
        return

    # Show plain ints in the message regardless of the integer type passed.
    invalid = [int(idx) if isinstance(idx, numbers.Integral) else idx for idx in invalid]

    context_str = f" in {context}" if context else ""
    xyz_str = f"\nStructure: {xyz_path}" if xyz_path else ""
    # An empty structure has no valid index; avoid printing "0 to -1".
    valid_range = f"0 to {atom_count-1}" if atom_count > 0 else "none"

    raise ValueError(f"""
{'='*60}
ERROR: Atom index out of bounds{context_str}
{'='*60}{xyz_str}
Total atoms: {atom_count} (valid indices: {valid_range})
Invalid indices: {invalid}

Note: pyEF uses 0-based indexing (first atom = index 0)

To find the correct atom index:
  1. Open the XYZ file in a text editor
  2. Count atoms starting from 0 (first atom = index 0)
  3. The Nth atom has index N-1

Example for a 50-atom system:
  esp_atom_idx: [25]  # Valid (< 50)
  substrate_idxs: [0, 1, 2]  # Valid
  # atom index 50 would be INVALID (>= 50)

For more help, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/ExampleUsage.py
  - README.md: Section 2.2 (Input File Formats)
{'='*60}
""")
def check_index_overlap(
    indices1: List[int],
    indices2: List[int],
    name1: str = "first set",
    name2: str = "second set",
    context: str = ""
) -> None:
    """
    Ensure two collections of atom indices share no element.

    Parameters
    ----------
    indices1 : list of int
        First set of atom indices
    indices2 : list of int
        Second set of atom indices
    name1 : str
        Label for the first set in the error message
    name2 : str
        Label for the second set in the error message
    context : str, optional
        Additional context for error message

    Raises
    ------
    ValueError
        If at least one index appears in both sets
    """
    shared = set(indices1) & set(indices2)
    if not shared:
        return

    rule = '=' * 60
    where = f" ({context})" if context else ""

    raise ValueError(f"""
{rule}
ERROR: Overlapping atom indices{where}
{rule}
{name1}: {sorted(indices1)}
{name2}: {sorted(indices2)}
Overlapping indices: {sorted(shared)}

The same atoms cannot be in both sets.
This would lead to incorrect results.

To fix:
  - Ensure the two sets are mutually exclusive
  - Review your system partitioning

Example:
  substrate_idxs: [0, 1, 2]  # Active site atoms
  env_idxs: [10, 11, 12, 13]  # Environment (no overlap)

For more information, see:
  - /home/gridsan/mmanetsch/pyEF/pyef/ExampleUsage.py
  - README.md: Section 3 (Electrostatic Stabilization)
{rule}
""")