import os
import tempfile
from datetime import datetime

from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd
import gradio as gr

# ===== Configuration =====
CHROMA_DIR = "chroma_db"
MODEL_NAME = "mrm8488/multilingual-e5-large-ft-sts-spanish-matryoshka-768-16-5e"
COLLECTION_NAMES = ["spc"]

# Columns returned to the caller by semantic_search, in display order.
DISPLAY_COLUMNS = [
    "Número de Resolución",
    "Fecha de Resolución",
    "Texto",
    "Enlace",
    "Relevante",
]

# Metadata keys (compared case-insensitively) whose values are treated as URLs.
URL_METADATA_KEYS = frozenset({"url", "link", "enlace"})

# ===== 1. Connect to the database and load the collections =====
print("Conectando a ChromaDB...")
client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = [client.get_collection(name) for name in COLLECTION_NAMES]

# ===== 2. Load the embedding model =====
print("Cargando modelo...")
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
print("✓ Sistema listo")


# ===== 3. Search function =====
def _rows_from_query(results, collection_name):
    """Turn one ChromaDB query response into a list of flat row dicts.

    Args:
        results: Mapping returned by ``collection.query`` for a single query
            embedding; only index ``[0]`` of ``ids``, ``documents``,
            ``metadatas`` and ``distances`` is read.
        collection_name: Stored in every row under ``'Colección'``.

    Returns:
        One dict per hit holding the fixed keys ``Relevante``, ``ID``,
        ``Similitud``, ``Texto`` and ``Colección`` plus every metadata key.
    """
    hits = zip(
        results["ids"][0],
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    )
    rows = []
    for doc_id, document, metadata, distance in hits:
        row = {
            "Relevante": False,  # selection checkbox, unchecked by default
            "ID": doc_id,
            # Similarity is derived as 1 - distance.
            # NOTE(review): this assumes the collections use cosine
            # distance -- confirm against how they were created.
            "Similitud": round(1 - distance, 4),
            "Texto": document,
            "Colección": collection_name,
        }
        # Metadata may be missing for a hit; treat that as "no extra columns".
        for key, value in (metadata or {}).items():
            if key.lower() in URL_METADATA_KEYS and value:
                # NOTE(review): only this label is stored; the URL value is
                # dropped here. Link markup around the label may have been
                # lost -- confirm the intended cell content before changing.
                row[key] = "🔗 Abrir"
            else:
                row[key] = value
        rows.append(row)
    return rows


def semantic_search(query: str, n_results: int = 20):
    """Search every loaded collection for the texts closest to ``query``.

    The query is embedded with the module-level ``model``, each collection in
    ``collections`` is queried for ``n_results`` hits, and the merged hits are
    sorted by descending similarity and cut to the best ``n_results``.

    Args:
        query: Free-text search string. ``None``, empty or whitespace-only
            input yields an empty result without touching the model.
        n_results: Maximum number of rows to return (also the per-collection
            limit). Coerced to ``int`` because the value is passed to APIs
            that require an integer count.

    Returns:
        A ``(DataFrame, str)`` tuple. The DataFrame has the columns listed in
        ``DISPLAY_COLUMNS`` (or is completely empty when there is nothing to
        show); the string is always ``""``.
    """
    print(f"Buscando: {query}")
    if not query or not query.strip():
        return pd.DataFrame(), ""

    n_results = int(n_results)
    query_embedding = model.encode(query).tolist()

    all_results = []
    for collection in collections:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        all_results.extend(_rows_from_query(results, collection.name))

    # Without hits the frame would have no 'Similitud' column and sorting
    # would raise KeyError, so report "nothing found" the same way as an
    # empty query.
    if not all_results:
        print("Resultados: 0")
        return pd.DataFrame(), ""

    df = (
        pd.DataFrame(all_results)
        .sort_values("Similitud", ascending=False)
        .head(n_results)
        # reindex (instead of df[[...]]) keeps the search usable when a
        # display column is absent from every hit's metadata: the column is
        # returned filled with NaN rather than raising KeyError.
        .reindex(columns=DISPLAY_COLUMNS)
    )
    print(f"Resultados: {len(df)}")
    return df, ""


# ===== 4.
Función para exportar seleccionados a Excel ===== def export_to_excel(df_with_selection): if df_with_selection is None or len(df_with_selection) == 0: gr.Warning("No hay datos para exportar") return None # Filtrar solo los marcados como True en la columna 'Relevante' df_selected = df_with_selection[df_with_selection['Relevante'] == True].copy() if len(df_selected) == 0: gr.Warning("No has seleccionado ninguna decisión") return None # Quitar la columna de checkbox del export df_export = df_selected.drop(columns=['Relevante']) # Limpiar HTML de los links para Excel for col in df_export.columns: if df_export[col].dtype == 'object': df_export[col] = df_export[col].apply( lambda x: x.replace('🔗 Abrir', '') if isinstance(x, str) and '