Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

File size: 5,966 Bytes

import re
import numpy as np
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

# Load the environment variables. On HF Spaces they are picked up from the
# Space secrets; locally they are read from the .env file.
load_dotenv()

class ActivaSemanticSplitter:
    """Split a text into chunks at the points where the topic changes.

    The text is split into sentences, each sentence is embedded together with
    its neighbours, and the cosine distance between consecutive embeddings is
    used to locate the breakpoints between chunks.
    """

    # Abbreviations that must not be treated as sentence ends (e.g. "pag." or
    # "art."): splitting there would break a sentence in half.
    _CUSTOM_ABBREVS = ('sec', 's', 'prof', 'dott', 'avv', 'pag', 'fig', 'nr', 'art')

    # Regex fallback boundary: whitespace preceded by '.', '?' or '!'.
    _SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.?!])\s+')

    # NLTK resources checked (and downloaded when missing) at start-up.
    _NLTK_RESOURCES = ('punkt', 'punkt_tab')

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
        """Load the embedding model and make sure the NLTK tokenizer data exists.

        Args:
            model_name: name of the HuggingFace embedding model to load.
            batch_size: number of texts embedded per call to the model.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
            Exception: whatever the embedding model raises while loading
                (the error is printed and re-raised unchanged).
        """
        if batch_size < 1:
            # A zero step breaks range() later on and a negative one silently
            # embeds nothing, so reject both up front.
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size
        self._tokenizer = None  # built lazily by _get_tokenizer()

        print("🔄 Inizializzazione HuggingFace Embedding Engine...")

        # MiniLM-L6 was chosen because the prototype needs a fast model with a
        # small RAM footprint that does not become the bottleneck during bulk
        # document ingestion.
        try:
            self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        except Exception as e:
            print(f"❌ Errore caricamento modello: {e}")
            raise
        else:
            print("✅ Modello caricato correttamente.")

        # Preventive check on the tokenizer data: download only what is missing.
        missing_resources = []
        for resource in self._NLTK_RESOURCES:
            try:
                nltk.data.find(f'tokenizers/{resource}')
            except LookupError:
                missing_resources.append(resource)
        if missing_resources:
            print("⬇️ Download risorse NLTK...")
            for resource in missing_resources:
                nltk.download(resource, quiet=True)

    @staticmethod
    def _load_italian_tokenizer():
        """Return the best available Punkt tokenizer for Italian.

        Tries, in order: the ``punkt_tab`` based ``PunktTokenizer`` (when the
        installed NLTK provides it), the legacy pickled Italian model, and
        finally an untrained ``PunktSentenceTokenizer``.

        Raises:
            ImportError: if NLTK is not installed.
        """
        import nltk
        from nltk.tokenize import punkt as punkt_module

        punkt_tokenizer_cls = getattr(punkt_module, "PunktTokenizer", None)
        if punkt_tokenizer_cls is not None:
            try:
                return punkt_tokenizer_cls("italian")
            except Exception:
                # Deliberate best effort: the punkt_tab data may be missing or
                # unreadable, in which case the older loaders are tried.
                pass

        try:
            return nltk.data.load('tokenizers/punkt/italian.pickle')
        except Exception:
            # Safety fallback when the pickle cannot be found or loaded.
            return punkt_module.PunktSentenceTokenizer()

    def _get_tokenizer(self):
        """Return the cached tokenizer, building and configuring it on first use."""
        tokenizer = getattr(self, "_tokenizer", None)
        if tokenizer is None:
            tokenizer = self._load_italian_tokenizer()
            # sent_tokenize() is not used because the custom abbreviations
            # below have to be injected into the tokenizer parameters.
            tokenizer._params.abbrev_types.update(self._CUSTOM_ABBREVS)
            self._tokenizer = tokenizer
        return tokenizer

    def _split_sentences(self, text):
        """Split ``text`` into sentences, dropping fragments of 5 characters or less.

        Falls back to a simple regex split when NLTK is unavailable or fails.

        Args:
            text: the text to split; ``None`` or blank text yields ``[]``.

        Returns:
            A list of stripped sentences.
        """
        # Basic clean-up, important for text coming from old OCR or dirty documents.
        text = (text or "").strip()
        if not text:
            return []

        try:
            sentences = self._get_tokenizer().tokenize(text)
        except ImportError:
            print("⚠️ NLTK non installato. Fallback su Regex semplice.")
            sentences = self._SENTENCE_BOUNDARY_RE.split(text)
        except Exception as e:
            print(f"⚠️ Errore NLTK ({e}). Fallback su Regex.")
            sentences = self._SENTENCE_BOUNDARY_RE.split(text)

        # Filter out background noise (strings that are too short or leftover spaces).
        stripped = (s.strip() for s in sentences)
        return [s for s in stripped if len(s) > 5]

    def combine_sentences(self, sentences, buffer_size=1):
        """Join every sentence with its neighbours to give it context.

        Embedding a lone sentence such as "Di conseguenza." carries little
        meaning, so each sentence is joined with up to ``buffer_size``
        sentences before and after it.

        Args:
            sentences: the list of single sentences.
            buffer_size: how many neighbours to take on each side.

        Returns:
            A list with one space-joined window per input sentence.
        """
        total = len(sentences)
        return [
            " ".join(sentences[max(0, i - buffer_size):min(total, i + 1 + buffer_size)])
            for i in range(total)
        ]

    def calculate_cosine_distances(self, sentences):
        """Embed ``sentences`` and measure the distance between consecutive ones.

        Args:
            sentences: the texts to embed, in document order.

        Returns:
            A tuple ``(distances, embeddings)``. ``distances[i]`` is
            ``1 - cosine_similarity`` between embedding ``i`` and ``i + 1``
            (0 = identical direction), so it has one element less than
            ``embeddings``; it is empty when fewer than two texts are given.
        """
        # Embed in batches so that a very large input is not sent to the model at once.
        embeddings = []
        for start in range(0, len(sentences), self.batch_size):
            batch = sentences[start:start + self.batch_size]
            embeddings.extend(self.embedding_model.embed_documents(batch))

        if len(embeddings) < 2:
            return [], embeddings

        # Normalise every vector once, then take the dot product of each
        # consecutive pair instead of calling a pairwise routine per pair.
        matrix = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # zero vectors keep similarity 0 instead of NaN
        unit_vectors = matrix / norms
        similarities = np.sum(unit_vectors[:-1] * unit_vectors[1:], axis=1)

        distances = (1.0 - similarities).tolist()
        return distances, embeddings

    def create_chunks(self, text, percentile_threshold=95):
        """Split ``text`` into chunks at its largest semantic jumps.

        Args:
            text: the document to split.
            percentile_threshold: percentile of the consecutive distances
                above which a gap between two sentences becomes a breakpoint.

        Returns:
            A tuple ``(chunks, distances, threshold)``. When no sentence is
            found the result is ``([], [], 0)``; when there is nothing to
            compare (a single sentence) it is ``([text], [], 0)``.
        """
        single_sentences = self._split_sentences(text)
        if not single_sentences:
            return [], [], 0

        combined_sentences = self.combine_sentences(single_sentences)
        distances, _ = self.calculate_cosine_distances(combined_sentences)

        if not distances:
            # Text too short to be split: keep it whole.
            return [text], [], 0

        # The cut-off is derived from the document's own semantic variation.
        threshold = np.percentile(distances, percentile_threshold)

        # distances[i] is the gap between sentence i and i + 1, so index i
        # marks the last sentence of a chunk. The final sentence always
        # closes the last chunk.
        last_sentence_indices = [i for i, gap in enumerate(distances) if gap > threshold]
        last_sentence_indices.append(len(single_sentences) - 1)

        # Rebuild the chunks from the original sentences (not the buffered ones).
        chunks = []
        start_index = 0
        for last_index in last_sentence_indices:
            end_index = last_index + 1
            chunk_text = " ".join(single_sentences[start_index:end_index])
            # Micro-fragments (single words, stray punctuation) are skipped.
            if len(chunk_text) > 20:
                chunks.append(chunk_text)
            start_index = end_index

        return chunks, distances, threshold