Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Files Community

AutomatedSemanticDiscovery / src /ingestion /semantic_splitter.py

GaetanoParente

rimossi import inutili e blindato utilizzo utente

9cbbfac about 1 month ago

raw

history blame contribute delete

5.97 kB

	import re
	import numpy as np
	import nltk
	from sklearn.metrics.pairwise import cosine_similarity
	from dotenv import load_dotenv
	from langchain_huggingface import HuggingFaceEmbeddings

	# Load the environment. On HF Spaces the values come from the configured secrets, locally from the .env file.
	load_dotenv()

	class ActivaSemanticSplitter:
	    """Split a text into chunks at the points where its topic shifts.

	    The pipeline is: sentence tokenization -> sliding-window context around
	    each sentence -> embedding of every window -> cosine distance between
	    consecutive windows -> cut wherever the distance exceeds a percentile
	    computed on the document itself.
	    """

	    # Abbreviations registered with the Punkt tokenizer so that the period in
	    # "pag." or "art." is not mistaken for the end of a sentence, which would
	    # break the sentence's meaning before it ever reaches the LLM.
	    _CUSTOM_ABBREVIATIONS = ('sec', 's', 'prof', 'dott', 'avv', 'pag', 'fig', 'nr', 'art')

	    # Fallback sentence boundary: whitespace preceded by '.', '?' or '!'.
	    _REGEX_SENTENCE_BOUNDARY = re.compile(r'(?<=[.?!])\s+')

	    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
	        """Load the embedding model and make sure the NLTK tokenizer data exists.

	        Args:
	            model_name: Name passed to ``HuggingFaceEmbeddings``.
	            batch_size: Number of texts embedded per ``embed_documents`` call.

	        Raises:
	            ValueError: If ``batch_size`` is smaller than 1.
	            Exception: Whatever ``HuggingFaceEmbeddings`` raises while loading
	                the model is reported on stdout and re-raised unchanged.
	        """
	        # A non-positive step would either crash the batching loop (0) or make
	        # it silently produce no embeddings at all (negative), so fail early.
	        if batch_size < 1:
	            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
	        self.batch_size = batch_size

	        # Sentence tokenizer, built lazily on first use and then reused.
	        self._tokenizer = None

	        print("🔄 Inizializzazione HuggingFace Embedding Engine...")

	        # MiniLM-L6 was chosen because this prototype needs a fast model with a
	        # small RAM footprint that does not become the bottleneck during
	        # massive document ingestion.
	        try:
	            self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
	            print("✅ Modello caricato correttamente.")
	        except Exception as e:
	            print(f"❌ Errore caricamento modello: {e}")
	            # Bare raise keeps the original traceback intact.
	            raise

	        # Preventive check on the tokenizer resources; download them if missing.
	        try:
	            nltk.data.find('tokenizers/punkt')
	            nltk.data.find('tokenizers/punkt_tab')
	        except LookupError:
	            print("⬇️ Download risorse NLTK...")
	            nltk.download('punkt', quiet=True)
	            nltk.download('punkt_tab', quiet=True)

	    def _get_tokenizer(self):
	        """Return the Punkt tokenizer with the custom abbreviations registered.

	        The tokenizer is cached on the instance after the first successful
	        build. Loading is attempted in this order: the pickled Italian model,
	        the ``punkt_tab`` Italian model, an untrained ``PunktSentenceTokenizer``.

	        Raises:
	            ImportError: If NLTK itself cannot be imported.
	        """
	        # getattr keeps this safe for instances created without __init__.
	        cached = getattr(self, "_tokenizer", None)
	        if cached is not None:
	            return cached

	        tokenizer = None

	        # Italian tokenizer loaded explicitly (instead of a plain sent_tokenize)
	        # because custom punctuation exceptions must be injected into it.
	        try:
	            tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
	        except Exception:
	            tokenizer = None

	        if tokenizer is None:
	            # NOTE(review): PunktTokenizer is expected only in NLTK releases
	            # that ship the punkt_tab data; on older releases the import fails
	            # and we simply move on to the next fallback.
	            try:
	                from nltk.tokenize.punkt import PunktTokenizer
	                tokenizer = PunktTokenizer("italian")
	            except Exception:
	                tokenizer = None

	        if tokenizer is None:
	            # Last-resort fallback when no Italian model can be loaded.
	            # An ImportError here propagates to the caller on purpose.
	            from nltk.tokenize.punkt import PunktSentenceTokenizer
	            tokenizer = PunktSentenceTokenizer()

	        # Relies on a private attribute of the Punkt implementation.
	        tokenizer._params.abbrev_types.update(self._CUSTOM_ABBREVIATIONS)

	        self._tokenizer = tokenizer
	        return tokenizer

	    def _split_sentences(self, text):
	        """Split ``text`` into cleaned sentences.

	        Uses the NLTK Punkt tokenizer and falls back to a simple regex split
	        when NLTK is missing or fails. Sentences whose stripped length is 5
	        characters or fewer are discarded as noise.

	        Args:
	            text: Raw text; ``None`` or an empty/blank string yields ``[]``.

	        Returns:
	            A list of stripped sentence strings.
	        """
	        if not text:
	            return []

	        # Basic cleanup, important for text coming from old OCR or dirty documents.
	        text = text.strip()
	        if not text:
	            return []

	        try:
	            sentences = self._get_tokenizer().tokenize(text)
	        except ImportError:
	            print("⚠️ NLTK non installato. Fallback su Regex semplice.")
	            sentences = self._REGEX_SENTENCE_BOUNDARY.split(text)
	        except Exception as e:
	            print(f"⚠️ Errore NLTK ({e}). Fallback su Regex.")
	            sentences = self._REGEX_SENTENCE_BOUNDARY.split(text)

	        # Drop background noise (strings that are too short or leftover spaces).
	        stripped = (s.strip() for s in sentences)
	        return [s for s in stripped if len(s) > 5]

	    def combine_sentences(self, sentences, buffer_size=1):
	        """Attach neighbouring sentences to each sentence to give it context.

	        Embedding a lone sentence such as "Di conseguenza." carries almost no
	        meaning, so every sentence is joined with up to ``buffer_size``
	        sentences before and after it; this keeps a short sentence from
	        skewing the cosine computation.

	        Args:
	            sentences: List of sentence strings.
	            buffer_size: Number of neighbours taken on each side.

	        Returns:
	            A list with one space-joined window per input sentence.
	        """
	        # max() prevents a negative start from wrapping around; the upper bound
	        # needs no clamp because slicing already stops at the end of the list.
	        return [
	            " ".join(sentences[max(0, i - buffer_size):i + 1 + buffer_size])
	            for i in range(len(sentences))
	        ]

	    def calculate_cosine_distances(self, sentences):
	        """Embed ``sentences`` and measure the distance between neighbours.

	        Args:
	            sentences: List of strings to embed (in order).

	        Returns:
	            A tuple ``(distances, embeddings)`` where ``distances[i]`` is the
	            cosine distance between item ``i`` and item ``i + 1`` (0 = identical
	            direction, 1 = orthogonal) and ``embeddings`` is the list of vectors
	            returned by the embedding model. ``distances`` is empty when fewer
	            than two embeddings are available.
	        """
	        # Embed in batches so that very large inputs do not saturate memory.
	        embeddings = []
	        for offset in range(0, len(sentences), self.batch_size):
	            batch = sentences[offset:offset + self.batch_size]
	            embeddings.extend(self.embedding_model.embed_documents(batch))

	        if len(embeddings) < 2:
	            return [], embeddings

	        # Normalise every vector once, then take the row-wise dot product of
	        # each vector with its successor: one vectorised pass instead of one
	        # library call per pair.
	        matrix = np.asarray(embeddings, dtype=float)
	        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
	        # A zero vector has no direction: leave it as zeros so its similarity
	        # to anything is 0 (distance 1) instead of dividing by zero.
	        norms[norms == 0.0] = 1.0
	        unit = matrix / norms
	        similarities = np.sum(unit[:-1] * unit[1:], axis=1)

	        # Turn similarity into distance (0 = same concept, 1 = full topic change).
	        distances = (1.0 - similarities).tolist()
	        return distances, embeddings

	    def create_chunks(self, text, percentile_threshold=95):
	        """Split ``text`` into chunks at its strongest semantic discontinuities.

	        Args:
	            text: The text to split.
	            percentile_threshold: Percentile of the document's own consecutive
	                distances above which a distance counts as a break point.

	        Returns:
	            A tuple ``(chunks, distances, threshold)``:
	            ``([], [], 0)`` when no sentence survives the splitting,
	            ``([text], [], 0)`` when there are too few sentences to compare,
	            otherwise the chunk strings, the consecutive distances and the
	            computed threshold. Chunks of 20 characters or fewer are dropped.
	        """
	        single_sentences = self._split_sentences(text)
	        if not single_sentences:
	            return [], [], 0

	        combined_sentences = self.combine_sentences(single_sentences)
	        distances, _ = self.calculate_cosine_distances(combined_sentences)

	        if not distances:
	            # Text too short to be split, keep it whole.
	            return [text], [], 0

	        # The cut threshold is derived dynamically from the semantic variation
	        # of the document itself.
	        threshold = np.percentile(distances, percentile_threshold)

	        # distances[i] sits between sentence i and i + 1, so a value above the
	        # threshold means "cut right after sentence i".
	        last_sentence_of_chunk = [i for i, d in enumerate(distances) if d > threshold]
	        # The final sentence always closes the last chunk.
	        last_sentence_of_chunk.append(len(single_sentences) - 1)

	        # Rebuild the paragraphs from the original sentences (not the buffered
	        # windows), delimited by the break points found above.
	        chunks = []
	        start_index = 0
	        for last in last_sentence_of_chunk:
	            end_index = last + 1
	            chunk_text = " ".join(single_sentences[start_index:end_index])
	            # Skip junk micro-fragments (e.g. single words or punctuation).
	            if len(chunk_text) > 20:
	                chunks.append(chunk_text)
	            start_index = end_index

	        return chunks, distances, threshold