from sentence_transformers import SentenceTransformer
import chromadb
import pandas as pd
import gradio as gr
from datetime import datetime
import os
from huggingface_hub import hf_hub_download, snapshot_download
import chromadb
import pandas as pd
# ===== 0. Descargar y preparar los datos =====
# Download the full dataset snapshot (all files/folders) from the HF Hub.
dataset_path = snapshot_download(
    repo_id="alexis07/oefa_21_26",
    repo_type="dataset",
)

# Page-level text comes from the pickle; per-document metadata from the CSV.
textos = pd.read_pickle(f"{dataset_path}/oefa_21_26.pkl")
metadata = pd.read_csv(f"{dataset_path}/dfai_0421_0226.csv")
metadata = metadata.drop(columns=['Link', 'pdf_urls'])
metadata = pd.merge(textos, metadata, on="ID", how="inner")

# Build the composite key "<ID>_<Page>" used as the Chroma document id.
# BUG FIX: use the merged frame's own 'Page' column. The previous code read
# `textos['Page']`, which relies on index alignment with the pre-merge frame;
# an inner merge can drop or duplicate rows, producing NaN/mismatched pages.
metadata['ID'] = metadata['ID'].astype(str) + '_' + metadata['Page'].astype(str)

# Set ID as index for fast lookups during search.
metadata.set_index('ID', inplace=True)

# ===== 1. Conectar a la base de datos y cargar colecciones =====
print("Conectando a ChromaDB...")
# The persisted Chroma DB lives in a subfolder of the downloaded snapshot.
CHROMA_DIR = f"{dataset_path}/local_chroma_dir"
MODEL_NAME = "nomic-ai/nomic-embed-text-v2-moe"
COLLECTION_NAMES = ["oefa"]
client = chromadb.PersistentClient(path=CHROMA_DIR)
collections = [client.get_collection(name) for name in COLLECTION_NAMES]

# ===== 2. Cargar el modelo =====
print("Cargando modelo...")
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
print("✓ Sistema listo")
# ===== 3. Función de búsqueda y renderizado de Cards =====
def semantic_search(query: str, n_results: int = 15):
    """Embed *query*, search every Chroma collection, and render result cards.

    Parameters
    ----------
    query : str
        Free-text query. Encoded with the model's dedicated ``query`` prompt.
    n_results : int, default 15
        Results requested per collection and the overall cap after merging.

    Returns
    -------
    tuple[str, str]
        (HTML string with one card per hit, empty string used to clear the
        query textbox in the Gradio UI).
    """
    print(f"Buscando: {query}")
    if not query.strip():
        return "### Por favor, introduce una consulta para comenzar.", ""

    # Encode once and reuse the embedding for every collection.
    query_embedding = model.encode(query, prompt_name="query").tolist()

    all_results = []
    for collection in collections:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["distances"],
        )
        # Chroma returns cosine *distance*; similarity = 1 - distance.
        # NOTE(review): assumes the collection's space is cosine — confirm.
        cosine_similarities = [1 - dist for dist in results['distances'][0]]
        for i, chroma_id in enumerate(results['ids'][0]):
            # Metadata is kept outside Chroma, in a DataFrame indexed by
            # the composite id "<ID>_<Page>".
            try:
                if chroma_id in metadata.index:
                    row_metadata = metadata.loc[chroma_id]
                    # Series (and DataFrame) always provide .get, so the
                    # previous hasattr() branching was redundant.
                    all_results.append({
                        'id': chroma_id,
                        'similitud': cosine_similarities[i],
                        'enlace': row_metadata.get('Enlace', '#'),
                        'text': row_metadata.get('Text', 'N/A'),
                        'date': row_metadata.get('Date', 'N/A'),
                        'page': row_metadata.get('Page', 'N/A'),
                    })
                else:
                    print(f"Warning: ID {chroma_id} not found in metadata DataFrame")
            except Exception as e:
                print(f"Error retrieving metadata for ID {chroma_id}: {str(e)}")

    # Merge hits from all collections, best similarity first, capped overall.
    all_results = sorted(all_results, key=lambda x: x['similitud'], reverse=True)[:n_results]
    if not all_results:
        return "### No se encontraron resultados relevantes.", ""

    # Render the cards; class names match the selectors in `custom_css`.
    # FIX: the original container/card strings were broken multi-line
    # literals (syntax errors) — reconstructed from the CSS class contract.
    html_output = '<div class="results-container">'
    for item in all_results:
        similitud_pct = f"{item['similitud'] * 100:.1f}%"
        enlace = item.get('enlace', '#')
        fecha = item.get('date', 'N/A')
        # str() guards against non-string values (e.g. NaN) before replace.
        texto = str(item.get('text', 'N/A')).replace('\n', ' ')
        pagina = item.get('page', 'N/A')
        html_output += f"""
        <div class="legal-card">
            <div class="card-header">
                <span class="res-number">📄 {item['id']}</span>
                <span class="res-date">📅 {fecha} · Pág. {pagina}</span>
                <span class="res-score">{similitud_pct}</span>
            </div>
            <div class="card-body">
                <p class="res-summary">{texto}</p>
            </div>
            <div class="card-footer">
                <a class="view-link" href="{enlace}" target="_blank">Ver documento →</a>
            </div>
        </div>
        """
    html_output += '</div>'
    return html_output, ""
# ===== 4. Interfaz Gradio =====
# Definición del CSS con soporte para Dark Mode
# Stylesheet injected into the Gradio app. The class names below
# (.results-container, .legal-card, .card-header, .res-score, .res-summary,
# .card-footer, .view-link) are the contract with whatever HTML is placed in
# the results component; a prefers-color-scheme media query provides dark mode.
custom_css = """
/* Light Mode (Default) */
.results-container {
display: flex;
flex-direction: column;
gap: 20px;
padding: 10px 0;
}
.legal-card {
background: white;
border-radius: 12px;
border: 1px solid #e5e7eb;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
transition: transform 0.2s ease, box-shadow 0.2s ease;
overflow: hidden;
}
.legal-card:hover {
transform: translateY(-3px);
box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
border-color: #3b82f6;
}
.card-header {
background: #f8fafc;
padding: 12px 20px;
border-bottom: 1px solid #e5e7eb;
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 15px;
}
.res-number, .res-date {
font-size: 14px;
color: #334155;
}
.res-score {
background: #3b82f6;
color: #ffffff;
padding: 4px 12px;
border-radius: 20px;
font-weight: 600;
font-size: 13px;
}
.card-body {
padding: 20px;
}
.res-summary {
color: #334155;
line-height: 1.7;
font-size: 15px;
margin: 0;
text-align: justify;
white-space: pre-wrap;
background-color: #f9fafb;
padding: 12px;
border-left: 3px solid #3b82f6;
border-radius: 4px;
}
.card-footer {
padding: 12px 20px;
background: #ffffff;
border-top: 1px dashed #e5e7eb;
text-align: right;
}
.view-link {
color: #2563eb;
text-decoration: none;
font-weight: 600;
font-size: 14px;
}
.view-link:hover {
text-decoration: underline;
color: #1d4ed8;
}
/* Dark Mode */
@media (prefers-color-scheme: dark) {
.legal-card {
background: #1f2937;
border: 1px solid #374151;
}
.card-header {
background: #111827;
border-bottom: 1px solid #374151;
}
.res-number, .res-date {
color: #e5e7eb;
}
.res-score {
background: #3b82f6;
color: #ffffff;
}
.res-summary {
color: #e5e7eb;
background-color: #111827;
border-left: 3px solid #60a5fa;
}
.card-footer {
background: #1f2937;
border-top: 1px dashed #374151;
}
.view-link {
color: #60a5fa;
}
.view-link:hover {
color: #93c5fd;
}
}
"""
# Build the Gradio UI: a title header, a query box, and an HTML results pane.
with gr.Blocks(
    title="Buscador Jurisprudencia Pro",
    theme=gr.themes.Soft(primary_hue="blue"),
    css=custom_css,
) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            # ⚖️ Buscador de Jurisprudencia Profesional
            ### Inteligencia Artificial aplicada al Derecho Ambiental
            """)
    with gr.Row():
        query_box = gr.Textbox(
            show_label=False,
            placeholder="Ej: ¿Cuáles son los requisitos para la subsanación voluntaria?",
            lines=1,
            container=False,
        )
    with gr.Row():
        # HTML component that displays the rendered result cards.
        results_display = gr.HTML(
            label="Resultados",
            # FIX: the placeholder was a broken multi-line string literal
            # (syntax error); restored as one HTML snippet.
            value='<div style="text-align:center; color:#9ca3af; padding:30px;">Los resultados aparecerán aquí...</div>',
        )

    # On submit: run the search (fills results, clears the box), then make
    # doubly sure the query box is cleared.
    query_box.submit(
        semantic_search,
        [query_box],
        [results_display, query_box],
    ).then(lambda: "", None, query_box)

if __name__ == "__main__":
    demo.launch()