"""Semantic search over OEFA environmental-law case documents.

Pipeline: download a Hugging Face dataset (page texts, document metadata and a
prebuilt ChromaDB index), load an embedding model, and serve a Gradio UI that
renders search hits as HTML "cards".
"""

from datetime import datetime
import os

import chromadb
import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
from sentence_transformers import SentenceTransformer

# ===== 0. Download the entire dataset (gets all files/folders) =====
dataset_path = snapshot_download(
    repo_id="alexis07/oefa_21_26",
    repo_type="dataset",
)

# Per-page texts (pickle) and per-document metadata (CSV), joined on doc ID.
textos = pd.read_pickle(f"{dataset_path}/oefa_21_26.pkl")
metadata = pd.read_csv(f"{dataset_path}/dfai_0421_0226.csv")
metadata = metadata.drop(columns=["Link", "pdf_urls"])
metadata = pd.merge(textos, metadata, on="ID", how="inner")

# Chroma ids have the form "<ID>_<Page>"; build the same composite key here.
# FIX: use the merged frame's own 'Page' column. The original indexed
# textos['Page'], which relies on positional index alignment and yields wrong
# (or NaN) keys whenever the inner merge drops rows.
metadata["ID"] = metadata["ID"].astype(str) + "_" + metadata["Page"].astype(str)

# Set ID as index for fast lookup from Chroma result ids.
metadata.set_index("ID", inplace=True)

# ===== 1. Connect to the database and load collections =====
print("Conectando a ChromaDB...")

# Chroma DB lives in a subfolder of the downloaded snapshot.
CHROMA_DIR = f"{dataset_path}/local_chroma_dir"
MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"
COLLECTION_NAMES = ["oefa"]

client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = [client.get_collection(name) for name in COLLECTION_NAMES]

# ===== 2. Load the embedding model =====
print("Cargando modelo...")
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
print("✓ Sistema listo")


# ===== 3. Search function and card rendering =====
def semantic_search(query: str, n_results: int = 15):
    """Embed `query`, search every collection, and render result cards.

    Returns a ``(html, "")`` tuple: the HTML string of result cards plus an
    empty string used by the Gradio wiring to clear the query textbox.
    """
    print(f"Buscando: {query}")
    if not query.strip():
        return "### Por favor, introduce una consulta para comenzar.", ""

    # nomic-embed v2 expects the "query" prompt for search queries.
    query_embedding = model.encode(query, prompt_name="query").tolist()

    all_results = []
    for collection in collections:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["distances"],
        )
        # Chroma returns distances; approximate cosine similarity = 1 - dist.
        cosine_similarities = [1 - dist for dist in results["distances"][0]]

        for i, chroma_id in enumerate(results["ids"][0]):
            try:
                if chroma_id in metadata.index:
                    # pandas Series/DataFrame both support .get, so the
                    # original hasattr(...) guard was unnecessary.
                    row_metadata = metadata.loc[chroma_id]
                    all_results.append({
                        "id": chroma_id,
                        "similitud": cosine_similarities[i],
                        "enlace": row_metadata.get("Enlace", "#"),
                        "text": row_metadata.get("Text", "N/A"),
                        "date": row_metadata.get("Date", "N/A"),
                        "page": row_metadata.get("Page", "N/A"),
                    })
                else:
                    print(f"Warning: ID {chroma_id} not found in metadata DataFrame")
            except Exception as e:
                # Best-effort: skip rows whose metadata lookup fails.
                print(f"Error retrieving metadata for ID {chroma_id}: {str(e)}")

    # Sort by similarity (descending) and keep the top n_results overall.
    all_results = sorted(
        all_results, key=lambda x: x["similitud"], reverse=True
    )[:n_results]

    if not all_results:
        return "### No se encontraron resultados relevantes.", ""

    # Generate the card HTML. NOTE(review): the original card template was
    # lost in a paste; this markup is reconstructed from the CSS classes
    # defined in custom_css below — confirm layout against the original app.
    html_output = '<div class="results-container">'
    for item in all_results:
        similitud_pct = f"{item['similitud'] * 100:.1f}%"
        enlace = item.get("enlace", "#")
        fecha = item.get("date", "N/A")
        texto = item.get("text", "N/A").replace("\n", " ")  # strip newlines
        pagina = item.get("page", "N/A")
        html_output += f"""
        <div class="legal-card">
            <div class="card-header">
                <span class="res-number">📄 {item['id']} · Pág. {pagina}</span>
                <span class="res-date">📅 {fecha}</span>
                <span class="res-score">{similitud_pct}</span>
            </div>
            <div class="card-body">
                <p class="res-summary">{texto}</p>
            </div>
            <div class="card-footer">
                <a class="view-link" href="{enlace}" target="_blank" rel="noopener">Ver documento completo →</a>
            </div>
        </div>
        """
    html_output += "</div>"
    return html_output, ""


# ===== 4. Gradio interface =====
# CSS with Dark Mode support via prefers-color-scheme.
custom_css = """
/* Light Mode (Default) */
.results-container { display: flex; flex-direction: column; gap: 20px; padding: 10px 0; }
.legal-card {
    background: white;
    border-radius: 12px;
    border: 1px solid #e5e7eb;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    transition: transform 0.2s ease, box-shadow 0.2s ease;
    overflow: hidden;
}
.legal-card:hover {
    transform: translateY(-3px);
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
    border-color: #3b82f6;
}
.card-header {
    background: #f8fafc;
    padding: 12px 20px;
    border-bottom: 1px solid #e5e7eb;
    display: flex;
    justify-content: space-between;
    align-items: center;
    flex-wrap: wrap;
    gap: 15px;
}
.res-number, .res-date { font-size: 14px; color: #334155; }
.res-score {
    background: #3b82f6;
    color: #ffffff;
    padding: 4px 12px;
    border-radius: 20px;
    font-weight: 600;
    font-size: 13px;
}
.card-body { padding: 20px; }
.res-summary {
    color: #334155;
    line-height: 1.7;
    font-size: 15px;
    margin: 0;
    text-align: justify;
    white-space: pre-wrap;
    background-color: #f9fafb;
    padding: 12px;
    border-left: 3px solid #3b82f6;
    border-radius: 4px;
}
.card-footer {
    padding: 12px 20px;
    background: #ffffff;
    border-top: 1px dashed #e5e7eb;
    text-align: right;
}
.view-link {
    color: #2563eb;
    text-decoration: none;
    font-weight: 600;
    font-size: 14px;
}
.view-link:hover { text-decoration: underline; color: #1d4ed8; }

/* Dark Mode */
@media (prefers-color-scheme: dark) {
    .legal-card { background: #1f2937; border: 1px solid #374151; }
    .card-header { background: #111827; border-bottom: 1px solid #374151; }
    .res-number, .res-date { color: #e5e7eb; }
    .res-score { background: #3b82f6; color: #ffffff; }
    .res-summary {
        color: #e5e7eb;
        background-color: #111827;
        border-left: 3px solid #60a5fa;
    }
    .card-footer { background: #1f2937; border-top: 1px dashed #374151; }
    .view-link { color: #60a5fa; }
    .view-link:hover { color: #93c5fd; }
}
"""

with gr.Blocks(
    title="Buscador Jurisprudencia Pro",
    theme=gr.themes.Soft(primary_hue="blue"),
    css=custom_css,
) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                # ⚖️ Buscador de Jurisprudencia Profesional
                ### Inteligencia Artificial aplicada al Derecho Ambiental
                """
            )
    with gr.Row():
        query_box = gr.Textbox(
            show_label=False,
            placeholder="Ej: ¿Cuáles son los requisitos para la subsanación voluntaria?",
            lines=1,
            container=False,
        )
    with gr.Row():
        # HTML component renders the result cards.
        # NOTE(review): the placeholder markup was garbled in the paste;
        # reconstructed as a simple centered message — confirm styling.
        results_display = gr.HTML(
            label="Resultados",
            value='<div style="text-align:center; color:#9ca3af; padding:40px;">Los resultados aparecerán aquí...</div>',
        )

    # Events: run the search on Enter, then clear the query box.
    # (semantic_search's second output already clears query_box; the .then
    # clear is redundant but kept to preserve the original wiring.)
    query_box.submit(
        semantic_search,
        [query_box],
        [results_display, query_box],
    ).then(lambda: "", None, query_box)

if __name__ == "__main__":
    demo.launch()