|
|
|
|
|
import re |
|
|
|
|
|
|
|
|
import re |
|
|
|
|
|
# Placeholder stored for every field that could not be extracted.
_NOT_DETECTED = "Non détecté"

# Labels of the identification block, as printed on a labelled K-bis.
_IDENTITY_LABELS = (
    "SIREN",
    "Date d'immatriculation",
    "Dénomination",
    "Forme juridique",
    "Capital",
    "Adresse",
    "Activités principales",
    "Durée de la personne morale",
    "Date de clôture de l'exercice social",
)

# Every key of the returned dict, in report order. All of them are always
# present so callers can index the result without guarding for KeyError.
_RESULT_KEYS = (
    "SIREN",
    "Date d'immatriculation",
    "Dénomination",
    "Forme juridique",
    "Capital",
    "Adresse",
    "Objet social",
    "Durée",
    "Date de clôture de l'exercice social",
    "Date de début d'activité",
    "Type d'exploitation",
    "Gérant(s)",
    "Associé(s)",
    "Adresse établissement",
)

# Headings that follow the identification block; used to bound the value of
# the last identification label.
_SECTION_HEADING = re.compile(
    r"Gestion, Direction, Administration"
    r"|Renseignements\s+sur\s+l['’]établissement\s+principal",
    re.I,
)

# Column patterns consumed left to right from the part of a table row that
# precedes the capital amount.
_TABLE_LEADING_PATTERNS = (
    ("SIREN", r"(\d{3} ?\d{3} ?\d{3})(?!\d)"),
    ("Date d'immatriculation", r"(\d{4}-\d{2}-\d{2})"),
    ("Dénomination", r"([A-Za-z0-9 ]+)"),
    ("Forme juridique", r"([A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç ]+)"),
)

# A capital amount must start with a digit, so stray spaces or dots in front
# of "Euros" are never mistaken for a number.
_CAPITAL_AMOUNT = r"(\d[\d., ]*)\s*Euros?"


def _format_capital(raw_amount: str) -> str:
    """Normalise a matched amount: decimal comma to dot, spaces removed, ' €' appended."""
    return raw_amount.replace(",", ".").replace(" ", "") + " €"


def _find_siren(text: str) -> str:
    """Return the first nine digits found after the first 'SIREN' mention, or the placeholder."""
    label = re.search(r"SIREN", text, re.I)
    if not label:
        return _NOT_DETECTED
    digits = re.findall(r"\d", text[label.end():])
    return "".join(digits[:9]) if len(digits) >= 9 else _NOT_DETECTED


def _label_pattern(label: str) -> str:
    """Build the regex locating *label*, accepting both ' and ’ as apostrophe."""
    if label == "Capital":
        # The label is printed either as "Capital" or as "Capital social".
        return r"Capital\s*(?:social)?"
    return "['’]".join(re.escape(part) for part in label.split("'"))


def _parse_labelled_identity(text: str) -> dict:
    """Extract identification fields when each label is followed by its value.

    *text* is the whitespace-normalised document. Each value runs from the end
    of its label to the start of the next located label; the last label's value
    runs to the next section heading, or to the end of the text.
    """
    located = []
    for label in _IDENTITY_LABELS:
        hit = re.search(_label_pattern(label), text, re.I)
        if hit:
            located.append((hit.start(), hit.end(), label))
    located.sort()

    fields = {}
    for (_, value_start, label), (next_start, _, _) in zip(located, located[1:]):
        fields[label] = text[value_start:next_start].strip()
    if located:
        # The last label has no following label to bound it.
        _, value_start, label = located[-1]
        heading = _SECTION_HEADING.search(text, value_start)
        value_end = heading.start() if heading else len(text)
        fields[label] = text[value_start:value_end].strip()

    capital = re.search(_CAPITAL_AMOUNT, fields.get("Capital", ""), re.I)
    closing_label = "Date de clôture de l'exercice social"
    return {
        "Date d'immatriculation": fields.get("Date d'immatriculation") or _NOT_DETECTED,
        "Dénomination": fields.get("Dénomination") or _NOT_DETECTED,
        "Forme juridique": fields.get("Forme juridique") or _NOT_DETECTED,
        "Capital": _format_capital(capital.group(1)) if capital else _NOT_DETECTED,
        "Adresse": fields.get("Adresse") or _NOT_DETECTED,
        "Objet social": fields.get("Activités principales", "").rstrip(".") or _NOT_DETECTED,
        "Durée": fields.get("Durée de la personne morale") or _NOT_DETECTED,
        closing_label: fields.get(closing_label) or _NOT_DETECTED,
    }


def _parse_table_identity(full_text: str) -> dict:
    """Extract identification fields when labels form a header row above a values row.

    Returns only the fields that were found; an empty dict when the header row
    or the capital amount (the anchor splitting the row) is missing.
    """
    rows = [line.strip() for line in full_text.split("\n") if line.strip()]
    values_row = None
    for header, following in zip(rows, rows[1:]):
        if re.search(r"SIREN Date d['’]immatriculation", header, re.I):
            values_row = following
            break
    if not values_row:
        return {}

    capital = re.search(_CAPITAL_AMOUNT, values_row, re.I)
    if not capital:
        return {}

    found = {"Capital": _format_capital(capital.group(1))}
    remaining = values_row[:capital.start()].strip()
    after_capital = values_row[capital.end():].strip()

    # Columns before the capital: consume them one by one from the left.
    # A column that does not match is simply left out, so an earlier
    # detection (e.g. the SIREN digit scan) is not overwritten.
    for key, pattern in _TABLE_LEADING_PATTERNS:
        hit = re.match(pattern, remaining, re.I)
        if hit:
            found[key] = hit.group(1).strip()
            remaining = remaining[hit.end():].strip()

    # The ASCII-only company-name pattern stops at the "é" of "Société";
    # give the split word back to the legal form.
    name = found.get("Dénomination", "")
    legal_form = found.get("Forme juridique", "")
    if name.endswith("Soci") and legal_form.startswith("été"):
        found["Dénomination"] = name[:-4].strip()
        found["Forme juridique"] = "Société " + legal_form[3:].lstrip()

    # Columns after the capital: address, then activity / duration / closing date.
    adresse_pattern = r"(\d+\s*[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+\s*\d{5}\s*[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+?)(?=\s[A-Z]|$)"
    address = re.match(adresse_pattern, after_capital, re.I)
    if address:
        found["Adresse"] = address.group(1).strip()
        remaining = after_capital[address.end():].strip()
    else:
        remaining = after_capital

    duration = re.search(r"(\d+ ans à partir du \d{4}-\d{2}-\d{2})", remaining, re.I)
    if duration:
        found["Durée"] = duration.group(1)
        activity = remaining[:duration.start()].strip().rstrip(".")
        if activity:
            found["Objet social"] = activity
        closing_date = remaining[duration.end():].strip()
        if closing_date:
            found["Date de clôture de l'exercice social"] = closing_date
    return found


def _parse_persons(full_text: str) -> tuple:
    """Return (managers, partners) names listed between the management and établissement headings."""
    section = re.search(
        r"Gestion, Direction, Administration, Contrôle, Associés ou Membres\s+(.*?)\s+Renseignements sur l['’]établissement principal",
        full_text,
        re.I | re.DOTALL,
    )
    persons_text = section.group(1).strip() if section else ""

    gerants = []
    associes = []
    if not persons_text:
        return gerants, associes

    # Compact form: "Qualité <role> Nom, prénoms <name> ...".
    compact_entries = re.finditer(
        r"Qualité\s+(gérant|Associé)\s+Nom[,;\s]*prénoms?\s*[:;,]?\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+?)(?:\s+Date et lieu|\s+Nationalité|\s+Adresse|\s+Qualité|$)",
        persons_text,
        re.I,
    )
    for entry in compact_entries:
        role = entry.group(1).lower()
        name = entry.group(2).strip().rstrip(";,.")
        (gerants if "gérant" in role else associes).append(name)

    # Table form: a header row followed by a "<role> <name>" row.
    header_row = re.compile(
        r"Qualité\s+Nom[;,]?\s*prénoms?\s+Date et lieu de naissance\s+Nationalité\s+Adresse",
        re.I,
    )
    rows = iter([line.strip() for line in persons_text.split("\n") if line.strip()])
    for row in rows:
        if not header_row.search(row):
            continue
        # The row right after a header is consumed as data, never re-tested
        # as a header itself.
        data_row = next(rows, None)
        if data_row is None:
            break
        entry = re.match(
            r"(gérant|Associé)\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+)",
            data_row,
            re.I,
        )
        if entry:
            role = entry.group(1).lower()
            name = entry.group(2).strip().rstrip(";,.")
            (gerants if "gérant" in role else associes).append(name)
    return gerants, associes


def _parse_establishment(full_text: str) -> dict:
    """Extract address and operating type from the 'établissement principal' section."""
    found = {}
    section = re.search(
        r"Renseignements\s+sur\s+l['’]établissement\s+principal\s+(.*?)(?=Bulletin\s+Officiel|$)",
        full_text,
        re.IGNORECASE | re.DOTALL,
    )
    if not section:
        return found
    block = section.group(1)

    address = re.search(
        r"Adresse\s+([^\n\r]+?)(?=\s+Date\s+début|\s+Type\s+d['’]exploitation|$)",
        block,
        re.IGNORECASE,
    )
    if address:
        # NOTE(review): this drops a trailing capitalised word, which also
        # removes a city written like "Paris" — confirm the intent.
        found["Adresse établissement"] = re.sub(
            r"\s+[A-Z][a-z]+$", "", address.group(1).strip()
        ).strip()

    operating_type = re.search(r"Type\s+d['’]exploitation\s+([^\n\r]+)", block, re.IGNORECASE)
    if operating_type:
        found["Type d'exploitation"] = operating_type.group(1).strip()
    return found


def parse_full_kbis(input_file: str = "kbis_extracted.txt", output_file: str = "kbis_full_analysis.txt") -> dict:
    """Parse the text extracted from a K-bis and write a field-by-field report.

    The identification block is read in one of two layouts: labelled (each
    label directly followed by its value, detected by digits right after
    "SIREN") or tabular (a header row of labels above one row of values).
    Managers/partners and the main-establishment section are then parsed from
    the raw text.

    Args:
        input_file: Path of the UTF-8 text file to analyse.
        output_file: Path of the UTF-8 report to write ("key: value" lines).

    Returns:
        A dict holding every report key, with "Non détecté" for fields that
        could not be extracted. If *input_file* does not exist, returns
        {"Erreur": "Fichier manquant"} and writes no report.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            full_text = f.read()
    except FileNotFoundError:
        print(f"Error: '{input_file}' not found.")
        return {"Erreur": "Fichier manquant"}

    # Single-spaced copy used by the layout-independent searches.
    text = re.sub(r"\s+", " ", full_text).strip()

    result = dict.fromkeys(_RESULT_KEYS, _NOT_DETECTED)
    result["SIREN"] = _find_siren(text)

    if re.search(r"SIREN\s*\d{3}", text, re.I):
        result.update(_parse_labelled_identity(text))
    else:
        result.update(_parse_table_identity(full_text))

    if result["SIREN"] != _NOT_DETECTED:
        result["SIREN"] = result["SIREN"].replace(" ", "")

    start_date = re.search(r"Date début d['’]activité\s+(\d{4}-\d{2}-\d{2})", text)
    if start_date:
        result["Date de début d'activité"] = start_date.group(1)

    gerants, associes = _parse_persons(full_text)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    if gerants:
        result["Gérant(s)"] = ", ".join(dict.fromkeys(gerants))
    if associes:
        result["Associé(s)"] = ", ".join(dict.fromkeys(associes))

    result.update(_parse_establishment(full_text))

    output = "K-BIS ANALYSE COMPLÈTE\n\n" + "".join(
        f"{key}: {value}\n" for key, value in result.items()
    )
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output)

    print("=== K-BIS ANALYSE COMPLÈTE ===")
    print(output)

    return result
|
|
if __name__ == "__main__":
    # Script entry point: analyse the default extraction file; the report
    # path falls back to the function's default.
    parse_full_kbis(input_file="kbis_extracted.txt")