Source code for pyef.validation

"""
Validation utilities for pyEF.

This module provides centralized validation functions for input parameters,
file paths, and data formats. Most functions raise ValueError with helpful
error messages that include examples and references to documentation; the
filter_* helpers instead print a warning and skip charge types that the
requested analysis does not support.
"""

import math
import numbers
import os
from typing import Union, List, Optional, Any


# Every charge partitioning scheme pyEF accepts (names are case-sensitive).
VALID_CHARGE_TYPES = {
    'ADCH',
    'AIM',
    'Becke',
    'CHELPG',
    'CM5',
    'EEM',
    'Hirshfeld',
    'Hirshfeld_I',
    'Lowdin',
    'MK',
    'Mulliken',
    'PEOE',
    'RESP',
    'SCPA',
    'Voronoi',
}

# Subset of the schemes above that can be used for multipole analysis
# (only these work with the Multiwfn multipole mode).
VALID_MULTIPOLE_CHARGE_TYPES = {
    'Becke',
    'Hirshfeld',
    'Hirshfeld_I',
}


def get_atom_count_from_xyz(xyz_path: str) -> int:
    """
    Parse the number of atoms from the first line of an XYZ file.

    Only the header line is read; the coordinate lines that follow are not
    counted or checked against the declared number.

    Parameters
    ----------
    xyz_path : str
        Path to the XYZ file

    Returns
    -------
    int
        Number of atoms declared on the first line (never negative)

    Raises
    ------
    ValueError
        If the file doesn't exist, cannot be read, or its first line is not
        a non-negative integer
    """
    if not os.path.exists(xyz_path):
        raise ValueError(f"""
{'='*60}
ERROR: XYZ file not found
{'='*60}
Path: {xyz_path}

The file does not exist. Please check the path.

For help with file paths, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.2 (Input File Formats)
{'='*60}
""")

    try:
        with open(xyz_path, 'r') as f:
            first_line = f.readline().strip()
        atom_count = int(first_line)
        # int() happily parses "-3"; a negative count is not a valid header and
        # would corrupt every downstream bounds check, so treat it as bad format.
        if atom_count < 0:
            raise ValueError(f"atom count cannot be negative (got {atom_count})")
    except (ValueError, OSError) as e:
        # OSError covers unreadable paths (permissions, directories, ...);
        # ValueError covers a non-integer header and undecodable bytes.
        raise ValueError(f"""
{'='*60}
ERROR: Invalid XYZ file format
{'='*60}
File: {xyz_path}
Issue: {str(e)}

XYZ format requires:
- First line: number of atoms (integer)
- Second line: comment
- Following lines: element x y z

Example XYZ file:
  3
  Water molecule
  O  0.000  0.000  0.000
  H  0.757  0.586  0.000
  H -0.757  0.586  0.000

For more information, see:
- README.md: Section 2.2 (Input File Formats)
{'='*60}
""") from e

    return atom_count


def validate_charge_type(charge_type: str, context: str = "") -> None:
    """
    Validate that a charge type is in the set of supported schemes.

    The comparison is case-sensitive. When the input differs from a supported
    name only by case, the error message suggests the correctly-cased name.
    Non-string input is rejected with the same ValueError instead of failing
    with an AttributeError/TypeError.

    Parameters
    ----------
    charge_type : str
        The charge partitioning scheme to validate
    context : str, optional
        Additional context for the error message

    Raises
    ------
    ValueError
        If the charge type is not a string or is not a supported scheme
    """
    # Check the type first: a non-string has no .lower(), and an unhashable
    # value (e.g. a list) cannot even be looked up in the set.
    is_text = isinstance(charge_type, str)
    if is_text and charge_type in VALID_CHARGE_TYPES:
        return

    context_str = f" in {context}" if context else ""

    suggestion = ""
    type_note = ""
    if is_text:
        # Find similar charge types (case-insensitive match)
        lowered = charge_type.lower()
        similar = [ct for ct in VALID_CHARGE_TYPES if ct.lower() == lowered]
        if similar:
            suggestion = f"\nDid you mean: '{similar[0]}'? (Note: charge types are case-sensitive)"
    else:
        type_note = f"\nExpected a string, got {type(charge_type).__name__}"

    raise ValueError(f"""
{'='*60}
ERROR: Invalid charge partitioning scheme{context_str}
{'='*60}
Specified: '{charge_type}'{type_note}

Valid charge types (case-sensitive):
  - Hirshfeld       (fast, good for most systems)
  - Hirshfeld_I     (most accurate, slower)
  - CHELPG          (for ESP fitting)
  - Becke           (fast, less accurate)
  - Mulliken, Lowdin, SCPA, ADCH, Voronoi
  - MK, AIM, CM5, EEM, RESP, PEOE{suggestion}

Example usage in config.yaml:
  charge_types: ['Hirshfeld_I']

Example usage in Python:
  esp_df = estat.getESP(['Hirshfeld_I'], 'output', multiwfn_path)

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2.3 (Key Parameters)
{'='*60}
""")


def filter_charge_types_for_multipole(charge_types: Union[str, List[str]],
                                       context: str = "") -> List[str]:
    """
    Keep only the charge types that can be used for multipole analysis.

    Every requested scheme is first checked with ``validate_charge_type``,
    so an unknown scheme raises. Known schemes that are not in
    ``VALID_MULTIPOLE_CHARGE_TYPES`` (Hirshfeld, Hirshfeld_I, Becke) are
    reported on stdout and dropped.

    Parameters
    ----------
    charge_types : str or list of str
        Charge partitioning scheme(s) to validate
    context : str, optional
        Additional context for warning/error messages

    Returns
    -------
    list of str
        The multipole-compatible schemes, in the order they were given.
        Empty when none of the requested schemes qualifies.

    Raises
    ------
    ValueError
        Propagated from ``validate_charge_type`` for an unsupported scheme

    Warnings
    --------
    Prints warnings for any charge types that don't support multipole analysis
    """
    requested = [charge_types] if isinstance(charge_types, str) else charge_types
    where = f" in {context}" if context else ""
    rule = '=' * 60

    # Unknown schemes are a hard error; only known ones get filtered below.
    for scheme in requested:
        validate_charge_type(scheme, context=context)

    # Split the request into usable and skipped schemes, preserving order.
    usable = []
    skipped = []
    for scheme in requested:
        bucket = usable if scheme in VALID_MULTIPOLE_CHARGE_TYPES else skipped
        bucket.append(scheme)

    if skipped:
        skipped_text = ', '.join(skipped)
        print(f"""
{rule}
WARNING: Invalid charge type(s) for multipole analysis{where}
{rule}
The following charge types do NOT support multipole analysis:
  {skipped_text}

Only these charge types work with multipole analysis:
  Hirshfeld, Hirshfeld_I, Becke

These charge types will be SKIPPED for multipole calculations.
""")

    # The "will be used" summary is only shown after a warning was printed.
    if skipped and usable:
        print("Valid charge types that will be used: " + ', '.join(usable))
        print(rule + "\n")

    # Nothing usable: tell the user and let the caller deal with the empty list.
    if not usable:
        print(f"No valid charge types remaining for multipole analysis{where}. Skipping.")
        print(rule + "\n")

    return usable


def filter_charge_types_for_monopole(charge_types: Union[str, List[str]],
                                      context: str = "") -> List[str]:
    """
    Keep only the charge types that can be used for monopole analysis.

    Any scheme listed in ``VALID_CHARGE_TYPES`` is accepted. Unlike the
    multipole variant, unknown schemes do not raise: they are reported on
    stdout and dropped.

    Parameters
    ----------
    charge_types : str or list of str
        Charge partitioning scheme(s) to validate
    context : str, optional
        Additional context for warning/error messages

    Returns
    -------
    list of str
        The accepted schemes, in the order they were given.
        Empty when none of the requested schemes is valid.

    Warnings
    --------
    Prints warnings for any invalid charge types
    """
    requested = [charge_types] if isinstance(charge_types, str) else charge_types
    where = f" in {context}" if context else ""
    rule = '=' * 60

    # Split the request into accepted and rejected schemes, preserving order.
    accepted = []
    rejected = []
    for scheme in requested:
        bucket = accepted if scheme in VALID_CHARGE_TYPES else rejected
        bucket.append(scheme)

    if rejected:
        rejected_text = ', '.join(rejected)
        print(f"""
{rule}
WARNING: Invalid charge type(s) for monopole analysis{where}
{rule}
The following charge types are NOT valid:
  {rejected_text}

Valid charge types for monopole analysis:
  Hirshfeld, Hirshfeld_I, Voronoi, Mulliken, Lowdin,
  SCPA, Becke, ADCH, CHELPG, MK, AIM, CM5, EEM, RESP, PEOE

These charge types will be SKIPPED.
""")

    # The "will be used" summary is only shown after a warning was printed.
    if rejected and accepted:
        print("Valid charge types that will be used: " + ', '.join(accepted))
        print(rule + "\n")

    # Nothing accepted: tell the user and let the caller deal with the empty list.
    if not accepted:
        print(f"No valid charge types remaining for monopole analysis{where}. Skipping.")
        print(rule + "\n")

    return accepted


def check_path_exists(path: str, path_type: str = "file", context: str = "") -> None:
    """
    Ensure that a path exists, and that the Multiwfn binary is executable.

    Existence is tested with ``os.path.exists``; ``path_type`` only selects
    which error message is used. When ``context`` equals "multiwfn_path"
    (case-insensitive) and ``path_type`` is "file", the path must also be
    executable.

    Parameters
    ----------
    path : str
        Path to check
    path_type : str
        "file" selects the file message; any other value selects the
        directory message
    context : str, optional
        Additional context for the error message

    Raises
    ------
    ValueError
        If the path does not exist, or if the Multiwfn file is not executable
    """
    rule = '=' * 60

    if not os.path.exists(path):
        context_str = f" ({context})" if context else ""

        if path_type != "file":
            raise ValueError(f"""
{rule}
ERROR: Directory not found{context_str}
{rule}
Path: {path}

The directory does not exist. Please check:
1. The path is correct
2. The directory exists
3. You have read permissions

For help, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{rule}
""")

        raise ValueError(f"""
{rule}
ERROR: File not found{context_str}
{rule}
Path: {path}

The file does not exist. Please check:
1. The path is correct
2. The file exists in the specified location
3. You have read permissions for the file

For help with file paths, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
- README.md: Section 2 (Quick Start Guide)
{rule}
""")

    # The Multiwfn binary must not only exist but also be runnable.
    is_multiwfn_binary = context.lower() == "multiwfn_path" and path_type == "file"
    if not is_multiwfn_binary:
        return
    if os.access(path, os.X_OK):
        return

    raise ValueError(f"""
{rule}
ERROR: Multiwfn executable is not executable
{rule}
Path: {path}

The file exists but is not executable. To fix:
  chmod +x {path}

Multiwfn is required for all pyEF calculations.

For installation instructions, see:
- README.md: Section 4 (Installation)
- https://pyef.readthedocs.io/
{rule}
""")


def validate_numeric_range(
    value: Any,
    name: str,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    allowed_values: Optional[List] = None,
    context: str = ""
) -> None:
    """
    Validate that a numeric parameter is within acceptable range.

    The value must be convertible with ``float()``. If ``allowed_values`` is
    given, only membership is checked and ``min_val``/``max_val`` are ignored:
    ``int`` values use exact membership, everything else is matched against
    each allowed value with an absolute tolerance of 1e-10. Otherwise the
    value is compared against the (inclusive) bounds that were supplied.

    Parameters
    ----------
    value : Any
        The value to validate
    name : str
        Parameter name for error messages
    min_val : float, optional
        Minimum allowed value (inclusive)
    max_val : float, optional
        Maximum allowed value (inclusive)
    allowed_values : list, optional
        List of specific allowed values; an empty list rejects every value
    context : str, optional
        Additional context for error message

    Raises
    ------
    ValueError
        If the value is not numeric, is not one of the allowed values, is
        out of range, or is NaN while a bound is specified
    """
    context_str = f" in {context}" if context else ""

    # Type check
    try:
        numeric_value = float(value)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"""
{'='*60}
ERROR: Invalid type for {name}{context_str}
{'='*60}
Expected: numeric value (int or float)
Got: {type(value).__name__} = {value}

Example:
  {name}: 1.0
  # or
  {name}: 4

For more examples, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""") from exc

    # Check against allowed values (takes precedence over the range bounds)
    if allowed_values is not None:
        if isinstance(value, int):
            is_allowed = value in allowed_values
        else:
            # For float values, check if close to any allowed value
            is_allowed = any(abs(numeric_value - av) < 1e-10 for av in allowed_values)

        if not is_allowed:
            # An empty list has no first element to show as an example;
            # indexing it here used to raise IndexError instead of ValueError.
            example = ""
            if len(allowed_values) > 0:
                example = f"\n\nExample:\n  {name}: {allowed_values[0]}"

            raise ValueError(f"""
{'='*60}
ERROR: Invalid value for {name}{context_str}
{'='*60}
Value: {value}
Allowed values: {allowed_values}{example}

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")
        return

    # NaN compares False against every bound, so without this guard it would
    # slip through both range checks below.
    has_bounds = min_val is not None or max_val is not None
    if has_bounds and math.isnan(numeric_value):
        raise ValueError(f"""
{'='*60}
ERROR: {name} is not a number (NaN){context_str}
{'='*60}
Value: {value}
Minimum allowed: {min_val}
Maximum allowed: {max_val}

NaN cannot satisfy a range constraint. Please provide a finite number.
{'='*60}
""")

    # Range validation
    if min_val is not None and numeric_value < min_val:
        raise ValueError(f"""
{'='*60}
ERROR: {name} value too small{context_str}
{'='*60}
Value: {value}
Minimum allowed: {min_val}

Example:
  {name}: {min_val if min_val >= 1 else 1.0}

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")

    if max_val is not None and numeric_value > max_val:
        raise ValueError(f"""
{'='*60}
ERROR: {name} value too large{context_str}
{'='*60}
Value: {value}
Maximum allowed: {max_val}

Example:
  {name}: {max_val if max_val <= 10 else 10.0}

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/example_config.yaml
{'='*60}
""")


def validate_atom_indices(
    indices: Union[int, List[int]],
    atom_count: int,
    context: str = "",
    xyz_path: str = ""
) -> None:
    """
    Validate that atom indices are within structure bounds.

    An index is valid when ``0 <= index < atom_count``. A single index may be
    any integral scalar (anything registered as ``numbers.Integral``), not
    only a built-in ``int``.

    Parameters
    ----------
    indices : int or list of int
        Atom index or list of atom indices to validate (0-indexed)
    atom_count : int
        Total number of atoms in the structure
    context : str, optional
        Description of where indices are used (for error message)
    xyz_path : str, optional
        Path to XYZ file for reference in error message

    Raises
    ------
    ValueError
        If any index is out of bounds
    """
    # Convert single index to list. numbers.Integral also matches integral
    # scalars that do not subclass int, which would otherwise hit the loop
    # below and fail with "object is not iterable".
    if isinstance(indices, numbers.Integral):
        indices = [indices]

    # Find invalid indices
    invalid = [idx for idx in indices if idx >= atom_count or idx < 0]

    if invalid:
        context_str = f" in {context}" if context else ""
        xyz_str = f"\nStructure: {xyz_path}" if xyz_path else ""
        # With no atoms there is no valid index; avoid printing "0 to -1".
        valid_range = f"0 to {atom_count-1}" if atom_count > 0 else "none"

        raise ValueError(f"""
{'='*60}
ERROR: Atom index out of bounds{context_str}
{'='*60}{xyz_str}
Total atoms: {atom_count} (valid indices: {valid_range})
Invalid indices: {invalid}

Note: pyEF uses 0-based indexing (first atom = index 0)

To find the correct atom index:
1. Open the XYZ file in a text editor
2. Count atoms starting from 0 (first atom = index 0)
3. The Nth atom has index N-1

Example for a 50-atom system:
  esp_atom_idx: [25]     # Valid (< 50)
  substrate_idxs: [0, 1, 2]  # Valid
  # atom index 50 would be INVALID (>= 50)

For more help, see:
- /home/gridsan/mmanetsch/pyEF/pyef/ExampleUsage.py
- README.md: Section 2.2 (Input File Formats)
{'='*60}
""")


def check_index_overlap(
    indices1: List[int],
    indices2: List[int],
    name1: str = "first set",
    name2: str = "second set",
    context: str = ""
) -> None:
    """
    Ensure that two collections of atom indices share no element.

    Parameters
    ----------
    indices1 : list of int
        First set of atom indices
    indices2 : list of int
        Second set of atom indices
    name1 : str
        Label used for the first set in the error message
    name2 : str
        Label used for the second set in the error message
    context : str, optional
        Additional context for error message

    Raises
    ------
    ValueError
        If at least one index appears in both sets
    """
    shared = set(indices1).intersection(set(indices2))
    if not shared:
        return

    rule = '=' * 60
    context_str = f" ({context})" if context else ""
    # All three listings are sorted so the report is easy to compare by eye.
    first_sorted = sorted(indices1)
    second_sorted = sorted(indices2)
    shared_sorted = sorted(shared)

    raise ValueError(f"""
{rule}
ERROR: Overlapping atom indices{context_str}
{rule}
{name1}: {first_sorted}
{name2}: {second_sorted}
Overlapping indices: {shared_sorted}

The same atoms cannot be in both sets.
This would lead to incorrect results.

To fix:
- Ensure the two sets are mutually exclusive
- Review your system partitioning

Example:
  substrate_idxs: [0, 1, 2]      # Active site atoms
  env_idxs: [10, 11, 12, 13]     # Environment (no overlap)

For more information, see:
- /home/gridsan/mmanetsch/pyEF/pyef/ExampleUsage.py
- README.md: Section 3 (Electrostatic Stabilization)
{rule}
""")