Spaces:

Cassius1Morbant
/

French_Legal_Chatbot

Running

App Files Files Community

French_Legal_Chatbot / parsepers.py

Cassius1Morbant

Upload 185 files

4f3563e verified 18 days ago

raw

history blame contribute delete

11.5 kB

	# kbis_full_parser_updated.py — Complete K-bis parser with improved address handling
	import re


	import re

	_NOT_DETECTED = "Non détecté"

	# Labels searched for in inline mode, where each label is immediately
	# followed by its value in the whitespace-normalised text.
	_INLINE_LABELS = (
	    "SIREN",
	    "Date d'immatriculation",
	    "Dénomination",
	    "Forme juridique",
	    "Capital",
	    "Adresse",
	    "Activités principales",
	    "Durée de la personne morale",
	    "Date de clôture de l'exercice social",
	)


	def _format_capital(raw_amount: str) -> str:
	    """Turn a captured amount such as "1 000,00" into "1000.00 €"."""
	    return raw_amount.replace(",", ".").replace(" ", "") + " €"


	def _detect_siren(text: str) -> str:
	    """Return the first nine digits that follow the word "SIREN", if any."""
	    siren_match = re.search(r"SIREN", text, re.I)
	    if not siren_match:
	        return _NOT_DETECTED
	    digits = re.findall(r"\d", text[siren_match.end():])
	    return "".join(digits[:9]) if len(digits) >= 9 else _NOT_DETECTED


	def _parse_inline_fields(text: str, result: dict) -> None:
	    """Fill *result* from text where each label is followed by its value.

	    Every value is the text between the end of its label and the start of
	    the next label found (labels are ordered by their first occurrence).
	    """
	    positions = {}
	    for label in _INLINE_LABELS:
	        if label == "Capital":
	            match = re.search(r"Capital\s*(social)?", text, re.I)
	        else:
	            match = re.search(re.escape(label), text, re.I)
	        if match:
	            positions[label] = match.start()

	    sorted_labels = sorted(positions, key=positions.get)

	    def value_start(label: str) -> int:
	        # "Capital" may be spelled "Capital social"; skip the longer form.
	        if label == "Capital":
	            social_match = re.search(
	                r"Capital\s*social", text[positions["Capital"]:], re.I
	            )
	            if social_match:
	                return positions["Capital"] + social_match.end()
	        return positions[label] + len(label)

	    fields = {}
	    for start_label, end_label in zip(sorted_labels, sorted_labels[1:]):
	        fields[start_label] = text[
	            value_start(start_label):positions[end_label]
	        ].strip()

	    # The last label has no following label to delimit it. Best effort: use
	    # the heading of the persons section (the same heading the persons
	    # parser anchors on) as its terminator; without it the value is skipped.
	    if sorted_labels:
	        last_label = sorted_labels[-1]
	        tail_start = value_start(last_label)
	        next_section = re.search(r"Gestion,\s*Direction", text[tail_start:], re.I)
	        if next_section:
	            fields[last_label] = text[
	                tail_start:tail_start + next_section.start()
	            ].strip()

	    result["Date d'immatriculation"] = fields.get("Date d'immatriculation", _NOT_DETECTED)
	    result["Dénomination"] = fields.get("Dénomination", _NOT_DETECTED)
	    result["Forme juridique"] = fields.get("Forme juridique", _NOT_DETECTED)

	    capital_match = re.search(
	        r"([\d., ]+)\s*Euros?", fields.get("Capital", _NOT_DETECTED), re.I
	    )
	    result["Capital"] = (
	        _format_capital(capital_match.group(1)) if capital_match else _NOT_DETECTED
	    )

	    result["Adresse"] = fields.get("Adresse", _NOT_DETECTED)
	    result["Objet social"] = fields.get("Activités principales", _NOT_DETECTED).rstrip(".")
	    result["Durée"] = fields.get("Durée de la personne morale", _NOT_DETECTED)
	    result["Date de clôture de l'exercice social"] = (
	        fields.get("Date de clôture de l'exercice social") or _NOT_DETECTED
	    )


	def _parse_separated_fields(full_text: str, result: dict) -> None:
	    """Fill *result* from text where labels sit on one line and values on the next.

	    The values line is split around the "<amount> Euros" capital, which is
	    the only value with an unambiguous shape; the other fields are peeled
	    off the text before and after it.
	    """
	    lines = [line.strip() for line in full_text.split("\n") if line.strip()]
	    values_line = None
	    for header, following in zip(lines, lines[1:]):
	        if re.search(r"SIREN Date d'immatriculation", header, re.I):
	            values_line = following
	            break

	    capital_match = (
	        re.search(r"([\d ,.]+)\s*Euros?", values_line, re.I) if values_line else None
	    )
	    if capital_match:
	        result["Capital"] = _format_capital(capital_match.group(1))
	        before = values_line[:capital_match.start()].strip()
	        after = values_line[capital_match.end():].strip()

	        # Fields located before the capital, consumed left to right.
	        temp_text = before
	        before_patterns = [
	            # Accept the nine digits with or without group separators.
	            ("SIREN", r"(\d{3} ?\d{3} ?\d{3})"),
	            ("Date d'immatriculation", r"(\d{4}-\d{2}-\d{2})"),
	            ("Dénomination", r"([A-Za-z0-9 ]+)"),
	            ("Forme juridique", r"([A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç ]+)"),
	        ]
	        for key, pattern in before_patterns:
	            match = re.match(pattern, temp_text, re.I)
	            if match:
	                result[key] = match.group(1).strip()
	                temp_text = temp_text[match.end():].strip()
	            else:
	                result[key] = _NOT_DETECTED

	        # The ASCII-only company-name pattern stops at the first accented
	        # letter, which cuts "Société" in two; glue the halves back together.
	        deno = result.get("Dénomination", "")
	        forme = result.get("Forme juridique", "")
	        if deno.endswith("Soci") and forme.startswith("été"):
	            result["Dénomination"] = deno[:-4].strip()
	            result["Forme juridique"] = "Société " + forme[3:].lstrip()

	        # Fields located after the capital: address first, then the
	        # activity text up to the duration, then the closing date.
	        adresse_pattern = r"(\d+\s[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+\s\d{5}\s*[A-Za-zÀ-ÿà-ÿéèêëîïôöùûüç\s-]+?)(?=\s[A-Z]|$)"
	        adresse_match = re.match(adresse_pattern, after, re.I)
	        if adresse_match:
	            result["Adresse"] = adresse_match.group(1).strip()
	            temp_text = after[adresse_match.end():].strip()
	        else:
	            result["Adresse"] = _NOT_DETECTED
	            temp_text = after

	        duree_pattern = r"(\d+ ans à partir du \d{4}-\d{2}-\d{2})"
	        duree_match = re.search(duree_pattern, temp_text, re.I)
	        if duree_match:
	            result["Durée"] = duree_match.group(1)
	            activites = temp_text[:duree_match.start()].strip().rstrip(".")
	            result["Objet social"] = activites if activites else _NOT_DETECTED
	            date_cloture = temp_text[duree_match.end():].strip()
	            result["Date de clôture de l'exercice social"] = (
	                date_cloture if date_cloture else _NOT_DETECTED
	            )
	        else:
	            result["Durée"] = _NOT_DETECTED
	            result["Objet social"] = _NOT_DETECTED
	            result["Date de clôture de l'exercice social"] = _NOT_DETECTED

	    # When the values line or the capital anchor is missing, still report
	    # every field so the result always has the same set of keys.
	    for key in (
	        "Capital",
	        "Date d'immatriculation",
	        "Dénomination",
	        "Forme juridique",
	        "Adresse",
	        "Durée",
	        "Objet social",
	        "Date de clôture de l'exercice social",
	    ):
	        result.setdefault(key, _NOT_DETECTED)


	def _parse_persons(full_text: str, result: dict) -> None:
	    """Fill "Gérant(s)" and "Associé(s)" from the management section."""
	    persons_match = re.search(
	        r"Gestion, Direction, Administration, Contrôle, Associés ou Membres\s+(.*?)\s+Renseignements sur l['’]établissement principal",
	        full_text,
	        re.I | re.DOTALL,
	    )
	    persons_text = persons_match.group(1).strip() if persons_match else ""

	    gerants = []
	    associes = []

	    def record(role: str, raw_name: str) -> None:
	        name = raw_name.strip().rstrip(";,.")
	        (gerants if "gérant" in role.lower() else associes).append(name)

	    if persons_text:
	        # Compact layout: "Qualité <role> Nom, prénoms : <name> ...".
	        # Separators between "Nom" and "prénoms" and around the colon are
	        # optional and repeatable, as in the tabular header pattern below.
	        compact = re.finditer(
	            r"Qualité\s+(gérant|Associé)\s+Nom[,;\s]*prénoms?\s*[:;,]?\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+?)(?:\s+Date et lieu|\s+Nationalité|\s+Adresse|\s+Qualité|$)",
	            persons_text,
	            re.I,
	        )
	        for m in compact:
	            record(m.group(1), m.group(2))

	        # Tabular layout: a header line followed by one line of values.
	        line_iter = iter(l.strip() for l in persons_text.split("\n") if l.strip())
	        for line in line_iter:
	            if re.search(
	                r"Qualité\s+Nom[;,]?\s*prénoms?\s+Date et lieu de naissance\s+Nationalité\s+Adresse",
	                line,
	                re.I,
	            ):
	                # The line right after the header is consumed whether or
	                # not it parses, so it is never re-read as a header.
	                candidate = next(line_iter, None)
	                if candidate is not None:
	                    match = re.match(
	                        r"(gérant|Associé)\s*([A-ZÀÂÄÇÉÈÊËÎÏÔÖÙÛÜŸ][A-Za-zàâäçéèêëîïôöùûüÿ\s,'’;-]+)",
	                        candidate,
	                        re.I,
	                    )
	                    if match:
	                        record(match.group(1), match.group(2))

	    # dict.fromkeys removes duplicates while keeping first-seen order.
	    result["Gérant(s)"] = ", ".join(dict.fromkeys(gerants)) if gerants else _NOT_DETECTED
	    result["Associé(s)"] = ", ".join(dict.fromkeys(associes)) if associes else _NOT_DETECTED


	def _parse_establishment(full_text: str, result: dict) -> None:
	    """Fill the address and exploitation type of the main establishment."""
	    etab_section = re.search(
	        r"Renseignements\s+sur\s+l['’]établissement\s+principal\s+(.*?)(?=Bulletin\s+Officiel|$)",
	        full_text,
	        re.IGNORECASE | re.DOTALL,
	    )
	    if not etab_section:
	        result["Adresse établissement"] = _NOT_DETECTED
	        result["Type d'exploitation"] = _NOT_DETECTED
	        return

	    block = etab_section.group(1)

	    addr_match = re.search(
	        r"Adresse\s+([^\n\r]+?)(?=\s+Date\s+début|\s+Type\s+d['’]exploitation|$)",
	        block,
	        re.IGNORECASE,
	    )
	    if addr_match:
	        adresse_etab = addr_match.group(1).strip()
	        # Drop one trailing Capitalised word (e.g. a leftover label fragment).
	        adresse_etab = re.sub(r"\s+[A-Z][a-z]+$", "", adresse_etab).strip()
	        result["Adresse établissement"] = adresse_etab
	    else:
	        result["Adresse établissement"] = _NOT_DETECTED

	    # Searched only inside this section so later sections cannot leak in.
	    type_match = re.search(r"Type\s+d['’]exploitation\s+([^\n\r]+)", block, re.IGNORECASE)
	    result["Type d'exploitation"] = (
	        type_match.group(1).strip() if type_match else _NOT_DETECTED
	    )


	def parse_full_kbis(input_file: str = "kbis_extracted.txt", output_file: str = "kbis_full_analysis.txt") -> dict:
	    """
	    Parse the text of a K-bis extract and write a "key: value" report.

	    Two layouts of the identification block are handled: "inline", where
	    each label is directly followed by its value ("SIREN 123 ..."), and
	    "separated", where all labels sit on one line and all values on the
	    next. The management section and the main-establishment section are
	    parsed the same way in both layouts.

	    Args:
	        input_file: Path of the UTF-8 text file to read.
	        output_file: Path of the UTF-8 report to write.

	    Returns:
	        A dict mapping field names to extracted values, with "Non détecté"
	        for fields that could not be found. If *input_file* does not exist,
	        a message is printed and {"Erreur": "Fichier manquant"} is returned
	        without writing the report.
	    """
	    try:
	        with open(input_file, "r", encoding="utf-8") as f:
	            full_text = f.read()
	    except FileNotFoundError:
	        print(f"Error: '{input_file}' not found.")
	        return {"Erreur": "Fichier manquant"}

	    # Single-line view of the document, used by the label-based searches.
	    text = re.sub(r"\s+", " ", full_text).strip()

	    result = {"SIREN": _detect_siren(text)}

	    # A digit group right after "SIREN" means labels and values are inline.
	    if re.search(r"SIREN\s*\d{3}", text, re.I):
	        _parse_inline_fields(text, result)
	    else:
	        _parse_separated_fields(full_text, result)

	    if result.get("SIREN") != _NOT_DETECTED:
	        result["SIREN"] = result["SIREN"].replace(" ", "")

	    date_debut_match = re.search(r"Date début d['’]activité\s+(\d{4}-\d{2}-\d{2})", text)
	    result["Date de début d'activité"] = (
	        date_debut_match.group(1) if date_debut_match else _NOT_DETECTED
	    )

	    # Reserve the key here so it keeps its position in the report; the
	    # establishment parser below always sets the actual value.
	    result["Type d'exploitation"] = _NOT_DETECTED

	    _parse_persons(full_text, result)
	    _parse_establishment(full_text, result)

	    output = "K-BIS ANALYSE COMPLÈTE\n\n" + "".join(
	        f"{key}: {value}\n" for key, value in result.items()
	    )

	    with open(output_file, "w", encoding="utf-8") as f:
	        f.write(output)

	    print("=== K-BIS ANALYSE COMPLÈTE ===")
	    print(output)

	    return result
	if __name__ == "__main__":
	    # Run as a script: parse "kbis_extracted.txt" with the default report path.
	    parse_full_kbis("kbis_extracted.txt")