|
|
from sentence_transformers import SentenceTransformer |
|
|
import chromadb |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from datetime import datetime |
|
|
import os |
|
|
from huggingface_hub import hf_hub_download, snapshot_download |
|
|
import chromadb |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetch the dataset repository from the Hugging Face Hub and keep the local
# directory path it returns; every data file below is read from that directory.
dataset_path = snapshot_download(
    repo_id="alexis07/oefa_21_26",
    repo_type="dataset",
)

# NOTE(security): unpickling executes whatever code the pickle file contains.
# This is only acceptable while the dataset repository above is trusted.
textos = pd.read_pickle(f"{dataset_path}/oefa_21_26.pkl")

metadata = pd.read_csv(f"{dataset_path}/dfai_0421_0226.csv")
metadata = metadata.drop(columns=['Link', 'pdf_urls'])

# Inner join: only IDs present in both the text frame and the CSV survive.
metadata = pd.merge(textos, metadata, on="ID", how="inner")

# Build the composite "<ID>_<Page>" key from the merged frame itself.
# The previous code added `textos['Page']`, which pandas aligns by index label:
# as soon as the merge drops or duplicates a row (or `textos` does not carry a
# default 0..n-1 index) the page suffix ends up on the wrong row or becomes NaN.
# If the CSV also had a 'Page' column, pandas would have suffixed the left
# (textos) one as 'Page_x', so fall back to that name.
page_column = 'Page' if 'Page' in metadata.columns else 'Page_x'
metadata['ID'] = metadata['ID'].astype(str) + '_' + metadata[page_column].astype(str)

# Index by the composite key so search hits can be looked up with `.loc`.
metadata = metadata.set_index('ID')
|
|
|
|
|
|
|
|
# Locations and names used to open the vector store and load the encoder.
CHROMA_DIR = f"{dataset_path}/local_chroma_dir"
MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"
COLLECTION_NAMES = ["oefa"]

# Open the persisted Chroma store and resolve every configured collection
# up front, so a missing collection fails at startup rather than per query.
print("Conectando a ChromaDB...")
client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = list(map(client.get_collection, COLLECTION_NAMES))

# Load the sentence-embedding model used to encode incoming queries.
print("Cargando modelo...")
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
print("✓ Sistema listo")
|
|
|
|
|
|
|
|
def _display_text(value, default="N/A"):
    """Return ``value`` as a string, or ``default`` when it is None or a float NaN."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return default
    return str(value)


def _result_card_html(item):
    """Render one search hit (a dict built by ``semantic_search``) as an HTML card."""
    # Function-scope import: keeps this block usable without editing the
    # file-level import section.
    from html import escape

    similitud_pct = f"{item['similitud'] * 100:.1f}%"
    # Every metadata-derived value is escaped: the text is free-form document
    # content and may contain '<', '>', '&' or quotes that would otherwise
    # break the markup (or inject HTML into the page).
    enlace = escape(_display_text(item.get('enlace'), '#'), quote=True)
    fecha = escape(_display_text(item.get('date')))
    texto = escape(_display_text(item.get('text')).replace('\n', ' '))
    pagina = escape(_display_text(item.get('page')))

    return f"""
    <div class="legal-card">
        <div class="card-header">
            <span class="res-number">⚖️ Página: {pagina}</span>
            <span class="res-date">📅 {fecha}</span>
            <span class="res-score">🎯 Relevancia: {similitud_pct}</span>
        </div>
        <div class="card-body">
            <p class="res-summary">{texto}</p>
        </div>
        <div class="card-footer">
            <a href="{enlace}" target="_blank" class="view-link">🔗 Ver Documento Completo</a>
        </div>
    </div>
    """


def semantic_search(query: str, n_results: int = 15):
    """Search every configured Chroma collection for ``query`` and render the hits.

    The query is embedded with the module-level ``model``, each collection in
    ``collections`` is asked for its ``n_results`` nearest entries, and every
    returned ID is resolved against the module-level ``metadata`` frame.

    Args:
        query: Free-text query. ``None``, empty or whitespace-only input
            short-circuits with a prompt message.
        n_results: Number of hits requested per collection and maximum number
            of cards rendered overall.

    Returns:
        A ``(html, "")`` tuple. The first element is the markup (or message)
        to display; the second is always an empty string, which the UI wiring
        in this file sends back to the query textbox.
    """
    print(f"Buscando: {query}")

    # NOTE(review): this message and the "no results" one use Markdown heading
    # syntax, but the UI in this file displays them in a gr.HTML component —
    # confirm whether the leading '###' is intended to be shown literally.
    if not query or not query.strip():
        return "### Por favor, introduce una consulta para comenzar.", ""

    query_embedding = model.encode(query, prompt_name="query").tolist()

    all_results = []
    for collection in collections:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["distances"],
        )

        # Results are nested per query embedding; only one was sent, hence [0].
        for chroma_id, distance in zip(results['ids'][0], results['distances'][0]):
            try:
                row_metadata = metadata.loc[chroma_id]
            except KeyError:
                print(f"Warning: ID {chroma_id} not found in metadata DataFrame")
                continue
            except Exception as e:
                # Deliberate best-effort: one bad lookup must not abort the search.
                print(f"Error retrieving metadata for ID {chroma_id}: {str(e)}")
                continue

            # A duplicated index label makes `.loc` return a DataFrame instead
            # of a Series; use its first row so each field is a scalar.
            if getattr(row_metadata, 'ndim', 1) == 2:
                row_metadata = row_metadata.iloc[0]

            all_results.append({
                'id': chroma_id,
                # Assumes the collection uses cosine distance, so that
                # 1 - distance is a cosine similarity — TODO confirm.
                'similitud': 1 - distance,
                'enlace': row_metadata.get('Enlace', '#'),
                'text': row_metadata.get('Text', 'N/A'),
                'date': row_metadata.get('Date', 'N/A'),
                'page': row_metadata.get('Page', 'N/A'),
            })

    # Merge the per-collection hits into one ranking, best match first.
    all_results = sorted(all_results, key=lambda x: x['similitud'], reverse=True)[:n_results]

    if not all_results:
        return "### No se encontraron resultados relevantes.", ""

    cards = "".join(_result_card_html(item) for item in all_results)
    return f'<div class="results-container">{cards}</div>', ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...). It styles the result cards emitted
# by semantic_search (classes: results-container, legal-card, card-header,
# res-number, res-date, res-score, card-body, res-summary, card-footer,
# view-link). The second half overrides colours inside a
# `prefers-color-scheme: dark` media query.
custom_css = """
/* Light Mode (Default) */
.results-container {
    display: flex;
    flex-direction: column;
    gap: 20px;
    padding: 10px 0;
}
.legal-card {
    background: white;
    border-radius: 12px;
    border: 1px solid #e5e7eb;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    transition: transform 0.2s ease, box-shadow 0.2s ease;
    overflow: hidden;
}
.legal-card:hover {
    transform: translateY(-3px);
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
    border-color: #3b82f6;
}
.card-header {
    background: #f8fafc;
    padding: 12px 20px;
    border-bottom: 1px solid #e5e7eb;
    display: flex;
    justify-content: space-between;
    align-items: center;
    flex-wrap: wrap;
    gap: 15px;
}
.res-number, .res-date {
    font-size: 14px;
    color: #334155;
}
.res-score {
    background: #3b82f6;
    color: #ffffff;
    padding: 4px 12px;
    border-radius: 20px;
    font-weight: 600;
    font-size: 13px;
}
.card-body {
    padding: 20px;
}
.res-summary {
    color: #334155;
    line-height: 1.7;
    font-size: 15px;
    margin: 0;
    text-align: justify;
    white-space: pre-wrap;
    background-color: #f9fafb;
    padding: 12px;
    border-left: 3px solid #3b82f6;
    border-radius: 4px;
}
.card-footer {
    padding: 12px 20px;
    background: #ffffff;
    border-top: 1px dashed #e5e7eb;
    text-align: right;
}
.view-link {
    color: #2563eb;
    text-decoration: none;
    font-weight: 600;
    font-size: 14px;
}
.view-link:hover {
    text-decoration: underline;
    color: #1d4ed8;
}

/* Dark Mode */
@media (prefers-color-scheme: dark) {
    .legal-card {
        background: #1f2937;
        border: 1px solid #374151;
    }
    .card-header {
        background: #111827;
        border-bottom: 1px solid #374151;
    }
    .res-number, .res-date {
        color: #e5e7eb;
    }
    .res-score {
        background: #3b82f6;
        color: #ffffff;
    }
    .res-summary {
        color: #e5e7eb;
        background-color: #111827;
        border-left: 3px solid #60a5fa;
    }
    .card-footer {
        background: #1f2937;
        border-top: 1px dashed #374151;
    }
    .view-link {
        color: #60a5fa;
    }
    .view-link:hover {
        color: #93c5fd;
    }
}
"""
|
|
|
|
|
# Gradio front end: a header, a single-line query box and an HTML results pane.
with gr.Blocks(
    title="Buscador Jurisprudencia Pro",
    theme=gr.themes.Soft(primary_hue="blue"),
    css=custom_css,
) as demo:

    # Page header.
    with gr.Row(), gr.Column(scale=1):
        gr.Markdown(
            "\n"
            "# ⚖️ Buscador de Jurisprudencia Profesional\n"
            "### Inteligencia Artificial aplicada al Derecho Ambiental\n"
        )

    # Query input; pressing Enter triggers the search wired up below.
    with gr.Row():
        query_box = gr.Textbox(
            container=False,
            lines=1,
            placeholder="Ej: ¿Cuáles son los requisitos para la subsanación voluntaria?",
            show_label=False,
        )

    # Results pane, pre-filled with a placeholder until the first search runs.
    with gr.Row():
        results_display = gr.HTML(
            value='<div style="text-align: center; color: #64748b; padding: 40px;">Los resultados aparecerán aquí...</div>',
            label="Resultados",
        )

    # semantic_search returns (html, ""): the HTML goes to the results pane and
    # the empty string back to the textbox. The chained step then sets the
    # textbox to "" once more after the search step.
    search_event = query_box.submit(
        fn=semantic_search,
        inputs=[query_box],
        outputs=[results_display, query_box],
    )
    search_event.then(fn=lambda: "", inputs=None, outputs=query_box)


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|