import os
import re
import tempfile
from datetime import datetime

import chromadb
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer


# Configuration: where the vector store lives, which embedding model to load,
# and which collections every search runs against.
CHROMA_DIR = "chroma_db"
MODEL_NAME = "mrm8488/multilingual-e5-large-ft-sts-spanish-matryoshka-768-16-5e"
COLLECTION_NAMES = ["spc"]

# Open the persistent store once at import time and resolve every configured
# collection up front so searches can reuse the handles.
print("Conectando a ChromaDB...")
client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = [
    client.get_collection(collection_name) for collection_name in COLLECTION_NAMES
]

# Load the embedding model once; it is shared by every search request.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run at load time - confirm this model actually requires it.
print("Cargando modelo...")
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
print("✓ Sistema listo")


# Metadata keys (compared case-insensitively) whose values are rendered as
# clickable links in the results table.
_LINK_KEYS = frozenset({"url", "link", "enlace"})

# Columns shown in the results table, in display order.
_DISPLAY_COLUMNS = [
    "Número de Resolución",
    "Fecha de Resolución",
    "Texto",
    "Enlace",
    "Relevante",
]


def _hit_to_row(doc_id, text, metadata, distance, collection_name):
    """Build one result row (a dict) from a single hit returned by a collection query."""
    row = {
        "Relevante": False,
        "ID": doc_id,
        # Reported as similarity = 1 - distance.
        # NOTE(review): this is only a cosine similarity if the collection
        # was created with the cosine distance metric - confirm.
        "Similitud": round(1 - distance, 4),
        "Texto": text,
        "Colección": collection_name,
    }
    # Metadata may be missing for a hit; treat that as "no extra columns".
    for key, value in (metadata or {}).items():
        if key.lower() in _LINK_KEYS and value:
            row[key] = f'<a href="{value}" target="_blank">🔗 Abrir</a>'
        else:
            row[key] = value
    return row


def semantic_search(query: str, n_results: int = 20):
    """Embed *query*, search every configured collection and return the best hits.

    Args:
        query: Free-text query. ``None``, empty or whitespace-only input
            short-circuits without touching the model or the collections.
        n_results: Number of hits requested from each collection and maximum
            number of rows kept after merging all collections.

    Returns:
        A ``(DataFrame, str)`` tuple. The frame holds the display columns
        sorted by descending similarity; the string is always ``""`` (the
        caller routes it to the query textbox output).
    """
    print(f"Buscando: {query}")

    if not query or not query.strip():
        return pd.DataFrame(), ""

    query_embedding = model.encode(query).tolist()

    rows = []
    for collection in collections:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["documents", "metadatas", "distances"],
        )
        # Each field is a list with one entry per query; only one query is sent.
        hits = zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        rows.extend(
            _hit_to_row(doc_id, text, metadata, distance, collection.name)
            for doc_id, text, metadata, distance in hits
        )

    if not rows:
        # Without rows there is no 'Similitud' column to sort on; return an
        # empty table that still carries the expected headers.
        print("Resultados: 0")
        return pd.DataFrame(columns=_DISPLAY_COLUMNS), ""

    df = (
        pd.DataFrame(rows)
        .sort_values("Similitud", ascending=False)
        .head(n_results)
        # reindex (rather than df[[...]]) keeps the table shape stable when a
        # metadata key is absent from the hits: the column is filled with NaN
        # instead of raising KeyError.
        .reindex(columns=_DISPLAY_COLUMNS)
    )
    print(f"Resultados: {len(df)}")

    return df, ""


# Matches an HTML anchor and captures the value of its href attribute.
_LINK_MARKUP = re.compile(r'<a\s+href="([^"]*)"[^>]*>.*?</a>')


def _strip_link_markup(value):
    """Replace an HTML anchor inside *value* with its bare URL; non-strings pass through."""
    if isinstance(value, str) and "<a href=" in value:
        return _LINK_MARKUP.sub(r"\1", value)
    return value


def export_to_excel(df_with_selection):
    """Write the rows marked as relevant to an .xlsx file in the temp directory.

    Args:
        df_with_selection: Results table as a DataFrame, expected to carry a
            boolean ``Relevante`` column marking the rows to export.

    Returns:
        Path of the written workbook, or ``None`` (after a UI warning) when
        there is nothing to export or no row is selected.
    """
    if df_with_selection is None or len(df_with_selection) == 0:
        gr.Warning("No hay datos para exportar")
        return None

    # A table that has rows but no selection column (for example one that was
    # never filled by a search - TODO confirm what the component sends then)
    # would otherwise fail with KeyError below.
    if "Relevante" not in df_with_selection.columns:
        gr.Warning("No hay datos para exportar")
        return None

    # Keep only rows whose flag is exactly True; .eq(True) mirrors the
    # element-wise `== True` comparison.
    df_selected = df_with_selection[df_with_selection["Relevante"].eq(True)].copy()

    if len(df_selected) == 0:
        gr.Warning("No has seleccionado ninguna decisión")
        return None

    df_export = df_selected.drop(columns=["Relevante"])

    # Link cells hold HTML for on-screen rendering; export the bare URL.
    # Both generic object columns and pandas' dedicated string dtype are
    # covered so text columns are cleaned regardless of how they are stored.
    for col in df_export.columns:
        column = df_export[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df_export[col] = column.apply(_strip_link_markup)

    temp_dir = tempfile.gettempdir()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(temp_dir, f"jurisprudencia_dc_{timestamp}.xlsx")
    df_export.to_excel(filename, index=False, engine="openpyxl")

    print(f"✓ Exportado: {filename} ({len(df_export)} registros)")
    gr.Info(f"✓ {len(df_export)} decisiones exportadas. Descargando...")

    return filename


# Theme: the Soft preset with a blue primary hue. The same fill values are set
# for the regular and the *_dark variants of each property.
_theme_overrides = {
    "body_background_fill": "*neutral_50",
    "body_background_fill_dark": "*neutral_50",
    "block_background_fill": "*neutral_50",
    "block_background_fill_dark": "*neutral_50",
    "input_background_fill": "white",
    "input_background_fill_dark": "white",
}
tema = gr.themes.Soft(primary_hue="blue").set(**_theme_overrides)


# Page layout: heading, single-line query box, editable results table and an
# export button, followed by the event wiring.
with gr.Blocks(title="Buscador Jurisprudencia") as demo:
    gr.Markdown("# 🔍 Experto en Protección al Consumidor")
    gr.Markdown("Realiza tu consulta preguntando por temas o introduciendo los hechos del caso")

    # Query input; pressing Enter fires the submit event wired below.
    query_box = gr.Textbox(
        lines=1,
        placeholder="Escribe tu consulta y presiona Enter...",
        show_label=False,
    )

    # Results table. It is interactive so the boolean column can be edited
    # by the user before exporting.
    results_table = gr.Dataframe(
        label="Resultados",
        datatype=["number", "number", "str", "markdown", "bool"],
        column_widths=["7%", "7%", "55%", "7%", "7%"],
        interactive=True,
        wrap=True,
        max_height=600,
    )

    with gr.Row(elem_id="export-row"):
        export_button = gr.Button(
            "📥 Exportar a Excel",
            variant="primary",
            size="sm",
            elem_id="export-btn",
        )

    # Receives the path returned by export_to_excel.
    # NOTE(review): the component is created hidden and nothing visible here
    # reveals it, so the exported file may never be offered to the user -
    # confirm how the download is expected to start.
    file_output = gr.File(label="", visible=False)

    # A search fills the table and writes the second return value back into
    # the query box.
    query_box.submit(
        fn=semantic_search,
        inputs=[query_box],
        outputs=[results_table, query_box],
    )

    export_button.click(
        fn=export_to_excel,
        inputs=[results_table],
        outputs=[file_output],
    )


# Custom stylesheet for the results table, the checkboxes and the export row.
# NOTE(review): the .svelte-* selectors look like class names generated by the
# frontend build and may differ between Gradio releases - verify after upgrades.
CUSTOM_CSS = """
    /* Bordes gruesos */
    .svelte-u825rv td,
    .svelte-u825rv th {
        border: 3px solid #888888 !important;
        padding: 12px !important;
    }

    .svelte-u825rv table {
        border-collapse: collapse !important;
    }

    /* Headers - targetear el span interno */
    .svelte-u825rv th .svelte-8fgf56.multiline.text {
        white-space: normal !important;
        word-wrap: break-word !important;
        text-align: center !important;
        display: block !important;
        line-height: 1.4 !important;
    }

    /* Centrar el header button también */
    .svelte-u825rv th .header-button {
        text-align: center !important;
        justify-content: center !important;
    }

    /* Checkbox con más sombra */
    input[type="checkbox"] {
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3) !important;
        transform: scale(1.2);
    }

    /* Botón */
    #export-row {
        justify-content: flex-end;
        margin-top: 10px;
    }

    #export-btn {
        max-width: 200px;
        min-width: 150px;
    }
    """

# Start the app with the custom theme and stylesheet.
# NOTE(review): theme/css are passed to launch() here rather than to
# gr.Blocks(); which of the two accepts them depends on the installed Gradio
# version - confirm against the pinned release.
demo.launch(theme=tema, css=CUSTOM_CSS)