Spaces:

Cassius1Morbant
/

French_Legal_Chatbot

Sleeping

File size: 11,532 Bytes

4f3563e

# kbis_full_parser_updated.py — Complete K-bis parser with improved address handling
import re


import re

# Placeholder stored for every field that could not be extracted.
_NOT_FOUND = "Non détecté"

# Labels searched for when label and value sit next to each other ("inline" layout).
_INLINE_LABELS = (
    "SIREN",
    "Date d'immatriculation",
    "Dénomination",
    "Forme juridique",
    "Capital",
    "Adresse",
    "Activités principales",
    "Durée de la personne morale",
    "Date de clôture de l'exercice social",
)

# Identity keys that must exist in the result whatever layout was parsed.
_IDENTITY_KEYS = (
    "Capital",
    "Date d'immatriculation",
    "Dénomination",
    "Forme juridique",
    "Adresse",
    "Durée",
    "Objet social",
    "Date de clôture de l'exercice social",
)

# Headings that open the sections parsed after the identity fields; used to
# bound the value of the last inline label.
_SECTION_START = re.compile(
    r"Gestion, Direction|Renseignements\s+sur\s+l['’]établissement|Bulletin\s+Officiel",
    re.I,
)


def _extract_siren(text: str) -> str:
    """Return the first nine digits found after the word "SIREN", or the placeholder."""
    label = re.search(r"SIREN", text, re.I)
    if not label:
        return _NOT_FOUND
    digits = re.findall(r"\d", text[label.end():])
    return "".join(digits[:9]) if len(digits) >= 9 else _NOT_FOUND


def _format_capital(amount: str) -> str:
    """Normalise a captured amount: decimal comma to dot, no spaces, euro sign appended."""
    return amount.replace(",", ".").replace(" ", "") + " €"


def _parse_inline_fields(text: str, result: dict) -> None:
    """Fill identity fields when each value directly follows its label in `text`."""
    spans = {}
    for label in _INLINE_LABELS:
        pattern = r"Capital\s*(social)?" if label == "Capital" else re.escape(label)
        match = re.search(pattern, text, re.I)
        if match:
            spans[label] = (match.start(), match.end())

    ordered = sorted(spans, key=lambda name: spans[name][0])
    fields = {}
    for index, label in enumerate(ordered):
        # The value starts right after the matched label (for Capital this
        # already includes an optional "social").
        value_start = spans[label][1]
        if index + 1 < len(ordered):
            value_end = spans[ordered[index + 1]][0]
        else:
            # Last label: stop at the next section heading, or at end of text.
            boundary = _SECTION_START.search(text, value_start)
            value_end = boundary.start() if boundary else len(text)
        fields[label] = text[value_start:value_end].strip()

    def field(label: str) -> str:
        return fields.get(label) or _NOT_FOUND

    result["Date d'immatriculation"] = field("Date d'immatriculation")
    result["Dénomination"] = field("Dénomination")
    result["Forme juridique"] = field("Forme juridique")

    capital_match = re.search(r"([\d., ]+)\s*Euros?", fields.get("Capital", ""), re.I)
    result["Capital"] = _format_capital(capital_match.group(1)) if capital_match else _NOT_FOUND

    result["Adresse"] = field("Adresse")
    result["Objet social"] = field("Activités principales").rstrip(".") or _NOT_FOUND
    result["Durée"] = field("Durée de la personne morale")
    result["Date de clôture de l'exercice social"] = field("Date de clôture de l'exercice social")


def _parse_separated_fields(full_text: str, result: dict) -> None:
    """Fill identity fields when all labels are on one line and all values on the next."""
    rows = [row.strip() for row in full_text.split("\n") if row.strip()]
    values_line = None
    for header, following in zip(rows, rows[1:]):
        if re.search(r"SIREN Date d'immatriculation", header, re.I):
            values_line = following
            break

    if values_line:
        # The capital amount ("... Euros") is the anchor splitting the values line.
        capital_match = re.search(r"([\d ,.]+)\s*Euros?", values_line, re.I)
        if capital_match:
            result["Capital"] = _format_capital(capital_match.group(1))
            before = values_line[:capital_match.start()].strip()
            after = values_line[capital_match.end():].strip()

            # Fields before the capital are consumed left to right.
            remaining = before
            before_patterns = [
                ("SIREN", r"(\d{3}\s?\d{3}\s?\d{3})"),
                ("Date d'immatriculation", r"(\d{4}-\d{2}-\d{2})"),
                ("Dénomination", r"([A-Za-z0-9 ]+)"),
                ("Forme juridique", r"([A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç ]+)"),
            ]
            for key, pattern in before_patterns:
                match = re.match(pattern, remaining, re.I)
                if match:
                    result[key] = match.group(1).strip()
                    remaining = remaining[match.end():].strip()
                else:
                    # Keep a value found earlier (the SIREN fallback) instead
                    # of overwriting it with the placeholder.
                    result.setdefault(key, _NOT_FOUND)

            # The ASCII-only name pattern stops at the "é" of "Société";
            # move the split word back to the legal form.
            deno = result.get("Dénomination", "")
            forme = result.get("Forme juridique", "")
            if deno.endswith("Soci") and forme.startswith("été"):
                result["Dénomination"] = deno[:-4].strip()
                result["Forme juridique"] = "Société " + forme[3:].lstrip()

            # After the capital: address, then activities, duration, closing date.
            adresse_pattern = r"(\d+\s*[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+\s*\d{5}\s*[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+?)(?=\s[A-Z]|$)"
            adresse_match = re.match(adresse_pattern, after, re.I)
            if adresse_match:
                result["Adresse"] = adresse_match.group(1).strip()
                remaining = after[adresse_match.end():].strip()
            else:
                result["Adresse"] = _NOT_FOUND
                remaining = after

            # The duration splits what is left: activities before, closing date after.
            duree_match = re.search(r"(\d+ ans à partir du \d{4}-\d{2}-\d{2})", remaining, re.I)
            if duree_match:
                result["Durée"] = duree_match.group(1)
                activites = remaining[:duree_match.start()].strip().rstrip(".")
                result["Objet social"] = activites if activites else _NOT_FOUND
                date_cloture = remaining[duree_match.end():].strip()
                result["Date de clôture de l'exercice social"] = date_cloture if date_cloture else _NOT_FOUND
            else:
                result["Durée"] = _NOT_FOUND
                result["Objet social"] = _NOT_FOUND
                result["Date de clôture de l'exercice social"] = _NOT_FOUND

        # A SIREN taken from the values line may still contain spaces.
        if result.get("SIREN") != _NOT_FOUND:
            result["SIREN"] = result["SIREN"].replace(" ", "")

    # Guarantee the same set of keys even when the values line or the capital
    # anchor was not found.
    for key in _IDENTITY_KEYS:
        result.setdefault(key, _NOT_FOUND)


def _parse_persons(full_text: str) -> tuple:
    """Return (gérants, associés) name lists found in the management section."""
    persons_match = re.search(
        r"Gestion, Direction, Administration, Contrôle, Associés ou Membres\s+(.*?)\s+Renseignements sur l['’]établissement principal",
        full_text,
        re.I | re.DOTALL
    )
    persons_text = persons_match.group(1).strip() if persons_match else ""

    gerants = []
    associes = []
    if not persons_text:
        return gerants, associes

    def register(role: str, raw_name: str) -> None:
        name = raw_name.strip().rstrip(";,.")
        (gerants if "gérant" in role.lower() else associes).append(name)

    # Layout 1: "Qualité <role> Nom, prénoms <name> ..." written in sequence.
    compact = re.finditer(
        r"Qualité\s+(gérant|Associé)\s+Nom[,;\s]*prénoms?\s*[:;,]?\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+?)(?:\s+Date et lieu|\s+Nationalité|\s+Adresse|\s+Qualité|$)",
        persons_text,
        re.I
    )
    for found in compact:
        register(found.group(1), found.group(2))

    # Layout 2: a header row of labels followed by one row of values.
    rows = iter([row.strip() for row in persons_text.split("\n") if row.strip()])
    for row in rows:
        if not re.search(r"Qualité\s+Nom[;,]?\s*prénoms?\s+Date et lieu de naissance\s+Nationalité\s+Adresse", row, re.I):
            continue
        # The row after a header is consumed as its values row.
        entry = next(rows, None)
        if entry is None:
            break
        match = re.match(
            r"(gérant|Associé)\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+)",
            entry,
            re.I
        )
        if match:
            register(match.group(1), match.group(2))

    return gerants, associes


def _parse_establishment(full_text: str, result: dict) -> None:
    """Fill the address and exploitation type from the "établissement principal" section."""
    etab_section = re.search(
        r"Renseignements\s+sur\s+l['’]établissement\s+principal\s+(.*?)(?=Bulletin\s+Officiel|$)",
        full_text,
        re.IGNORECASE | re.DOTALL
    )
    if not etab_section:
        result["Adresse établissement"] = _NOT_FOUND
        result["Type d'exploitation"] = _NOT_FOUND
        return

    block = etab_section.group(1)

    addr_match = re.search(
        r"Adresse\s+([^\n\r]+?)(?=\s+Date\s+début|\s+Type\s+d['’]exploitation|$)",
        block,
        re.IGNORECASE
    )
    if addr_match:
        # Only trailing punctuation is trimmed: the last word of an address is
        # usually the city and must be kept.
        adresse_etab = addr_match.group(1).strip().rstrip(" \t.,;:")
        result["Adresse établissement"] = adresse_etab or _NOT_FOUND
    else:
        result["Adresse établissement"] = _NOT_FOUND

    # Searched only inside this section so text from later sections is not captured.
    type_match = re.search(r"Type\s+d['’]exploitation\s+([^\n\r]+)", block, re.IGNORECASE)
    result["Type d'exploitation"] = type_match.group(1).strip() if type_match else _NOT_FOUND


def parse_full_kbis(input_file: str = "kbis_extracted.txt", output_file: str = "kbis_full_analysis.txt") -> dict:
    """
    Parse the text extracted from a K-bis and write a "key: value" report.

    The text is read from `input_file`. Identity fields are parsed in "inline"
    mode when the word SIREN is directly followed by digits, otherwise in
    "separated" mode (a line of labels followed by a line of values). The
    activity start date, the managers/partners and the "établissement
    principal" section are then parsed. The report is written to
    `output_file` and printed.

    Args:
        input_file: Path of the UTF-8 text file to parse.
        output_file: Path of the UTF-8 report file to write.

    Returns:
        A dict mapping field names to extracted values; fields that could not
        be extracted hold "Non détecté". If `input_file` does not exist,
        returns {"Erreur": "Fichier manquant"} and writes nothing.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            full_text = f.read()
    except FileNotFoundError:
        print(f"Error: '{input_file}' not found.")
        return {"Erreur": "Fichier manquant"}

    # Whitespace-collapsed copy for patterns that should ignore line breaks;
    # line-oriented parsing keeps using full_text.
    text = re.sub(r"\s+", " ", full_text).strip()

    result = {"SIREN": _extract_siren(text)}

    if re.search(r"SIREN\s*\d{3}", text, re.I):
        _parse_inline_fields(text, result)
    else:
        _parse_separated_fields(full_text, result)

    date_debut_match = re.search(r"Date début d['’]activité\s+(\d{4}-\d{2}-\d{2})", text)
    result["Date de début d'activité"] = date_debut_match.group(1) if date_debut_match else _NOT_FOUND

    # Reserve the key here to keep its position in the report; the value is
    # set by _parse_establishment below.
    result["Type d'exploitation"] = _NOT_FOUND

    gerants, associes = _parse_persons(full_text)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    result["Gérant(s)"] = ", ".join(dict.fromkeys(gerants)) if gerants else _NOT_FOUND
    result["Associé(s)"] = ", ".join(dict.fromkeys(associes)) if associes else _NOT_FOUND

    _parse_establishment(full_text, result)

    output = "K-BIS ANALYSE COMPLÈTE\n\n" + "".join(
        f"{key}: {value}\n" for key, value in result.items()
    )

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output)

    print("=== K-BIS ANALYSE COMPLÈTE ===")
    print(output)

    return result
if __name__ == "__main__":
    # Script entry point: parse the default extraction file and write the
    # report to the function's default output path.
    parse_full_kbis(input_file="kbis_extracted.txt")