chemical_equation_balancer/parser.py

import re
from validity_check import check_atom


def parse_atom(atom):
    """
    Convert a single atom token of a formula into a one-entry dict.

    The token is an element symbol optionally followed by a count, e.g.
    'H', 'Na', 'O2', 'Cl2' or 'C12'. Every trailing digit belongs to the
    count, so multi-digit subscripts are read in full. A missing count
    means 1.

    The symbol is validated by check_atom (imported from validity_check);
    how that helper reports an unknown symbol is not visible in this file.

    :param atom: string form of the atom, symbol plus optional count
    :return: dict form of the atom, {symbol: count}
    :raises ValueError: if the token contains no symbol at all
    """
    # Split on the boundary between the symbol and its trailing digit run.
    symbol = atom.rstrip('0123456789')
    digits = atom[len(symbol):]
    if not symbol:
        # Empty token or digits only: there is nothing to validate or count.
        raise ValueError('无效的原子: ' + atom)
    check_atom(symbol)
    return {symbol: int(digits) if digits else 1}


def combine_same_atom(parsed_molecule):
    """
    Merge repeated elements of a parsed formula by summing their counts.

    Elements keep the order in which they first appear. The input dict and
    the atom dicts inside it are left untouched; the result is built from
    fresh objects.

    :param parsed_molecule: dict form of a formula with the keys 'atoms'
        (list of {symbol: count} dicts), 'coefficient' and 'pretty_name'
    :return: dict of the same shape whose 'atoms' list holds each symbol
        exactly once
    """
    # A plain dict preserves insertion order, which gives first-seen ordering
    # without rescanning the already merged atoms for every new one.
    totals = {}
    for atom in parsed_molecule['atoms']:
        for symbol, count in atom.items():
            totals[symbol] = totals.get(symbol, 0) + count
    return {
        'atoms': [{symbol: count} for symbol, count in totals.items()],
        'coefficient': parsed_molecule['coefficient'],
        'pretty_name': parsed_molecule['pretty_name']
    }


def parse_atomic_clusters(atomic_clusters):
    """
    Convert the token list of one parenthesised group into a dict.

    The list is the form produced by parse_molecule, for example
    (ClO)2 -> ['(', 'Cl', 'O', ')2']: an opening token, the atom tokens,
    and a closing token made of ')' plus an optional multiplier.

    The atom counts in the result are NOT multiplied by the group
    multiplier; the multiplier is returned separately as 'coefficient'.

    :param atomic_clusters: token list of the group, including both
        parenthesis tokens
    :return: {'atoms': [{symbol: count}, ...],
              'coefficient': group multiplier (1 when absent),
              'pretty_name': the atom tokens joined, without parentheses}
    :raises ValueError: if the list is too short to hold both parenthesis
        tokens, or the last token is not ')' followed only by digits
    """
    if len(atomic_clusters) < 2:
        raise ValueError('无效的原子团')
    # The closing token carries the multiplier; read every digit of it and
    # treat a bare ')' as a multiplier of 1.
    closing = re.fullmatch(r'\)([0-9]*)', atomic_clusters[-1])
    if closing is None:
        raise ValueError('无效的原子团系数')
    coefficient = int(closing.group(1) or 1)
    # Drop the opening and closing tokens in every case so that only atom
    # tokens reach parse_atom and the pretty name.
    members = atomic_clusters[1:-1]
    atoms = [parse_atom(member) for member in members]
    return {
        'atoms': atoms,
        'coefficient': coefficient,
        'pretty_name': ''.join(members)
    }


def parse_molecule(molecule):
    """
    Parse one term of an equation (optional coefficient + formula) into a dict.

    Elements are told apart by case: a symbol is one uppercase letter
    followed by any lowercase letters. A formula may contain several
    elements and parenthesised groups. The result has the shape:
    {
        'atoms': [ {symbol: count}, {symbol: count}, ... ],
        'coefficient': leading integer coefficient (1 when absent),
        'pretty_name': the formula text without the leading coefficient
    }
    Atoms outside parentheses come first, followed by the atoms of the
    parenthesised groups with the group multiplier already applied;
    repeated symbols are then merged by combine_same_atom.

    :param molecule: formula string such as 'H2O', '2H2O' or 'Ca(OH)2'
    :return: dict form of the formula
    :raises ValueError: if the string is empty, or its parentheses are
        unbalanced or nested
    """
    if not molecule:
        raise ValueError('化学式不能为空')
    # Every leading digit belongs to the coefficient ('12H2O' is 12 x H2O).
    leading = re.match(r'[0-9]+', molecule)
    if leading:
        coefficient = int(leading.group())
        molecule = molecule[leading.end():]
    else:
        coefficient = 1
    pretty_name = molecule
    # Tokenise into: element symbol with its own subscript, '(' on its own,
    # ')' with its multiplier, and runs of anything else. The last kind is
    # not a valid atom; it is still handed to parse_atom so that rejection
    # of bad symbols stays in one place.
    tokens = re.findall(r'[A-Z][a-z]*[0-9]*|\(|\)[0-9]*|[^A-Z()]+', molecule)
    atoms = []
    grouped_atoms = []
    position = 0
    while position < len(tokens):
        token = tokens[position]
        if token == '(':
            # Find the token that closes this group.
            close = position + 1
            while close < len(tokens) and not tokens[close].startswith(')'):
                close += 1
            if close == len(tokens):
                raise ValueError('括号不匹配')
            if '(' in tokens[position + 1:close]:
                raise ValueError('不支持嵌套括号')
            cluster = tokens[position:close + 1]
            if cluster[-1] == ')':
                # Make the multiplier explicit so the group parser always
                # receives a closing token of the form ')N'.
                cluster[-1] = ')1'
            parsed_cluster = parse_atomic_clusters(cluster)
            for each in parsed_cluster['atoms']:
                for symbol, count in each.items():
                    grouped_atoms.append(
                        {symbol: count * parsed_cluster['coefficient']}
                    )
            position = close + 1
        elif token.startswith(')'):
            # A closing parenthesis with no matching opening one.
            raise ValueError('括号不匹配')
        else:
            atoms.append(parse_atom(token))
            position += 1
    parsed_atom = {
        'atoms': atoms + grouped_atoms,
        'coefficient': coefficient,
        'pretty_name': pretty_name
    }
    return combine_same_atom(parsed_atom)


def parse_equation(eq):
    """
    Parse a chemical equation string into a dict of parsed formulas.

    All whitespace is ignored. The two sides may be separated by '=',
    '==', '=>', '==>' or '->'; the terms on each side are separated by '+'.
    Each term is parsed with parse_molecule.

    :param eq: equation string, e.g. 'H2 + O2 -> H2O'
    :return: {'left': [parsed formula, ...], 'right': [parsed formula, ...]}
    :raises ValueError: if the equation does not contain exactly one
        separator, or if any term on either side is empty
    """
    compact = ''.join(eq.split())
    # Match the separator as a whole so that an input already written with
    # '=>' is not split in the middle of the arrow.
    sides = re.split(r'->|=+>?', compact)
    if len(sides) != 2:
        raise ValueError('方程式必须恰好包含一个等号或箭头')
    left_terms = sides[0].split('+')
    right_terms = sides[1].split('+')
    if '' in left_terms or '' in right_terms:
        # Covers an empty side as well as a stray or doubled '+'.
        raise ValueError('方程式两侧不能包含空的化学式')
    return {
        'left': [parse_molecule(term) for term in left_terms],
        'right': [parse_molecule(term) for term in right_terms]
    }