Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,55 +1,50 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
import re
|
| 3 |
-
import unicodedata
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Tuple
|
| 6 |
-
|
| 7 |
import gradio as gr
|
| 8 |
import joblib
|
| 9 |
import pandas as pd
|
| 10 |
from scipy import sparse
|
| 11 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
#
|
| 15 |
-
#
|
| 16 |
ROOT = Path(__file__).parent
|
| 17 |
ART = ROOT / "artifacts"
|
| 18 |
VEC_PATH = ART / "tfidf_vectorizer.joblib"
|
| 19 |
MAT_PATH = ART / "tfidf_matrix.npz"
|
| 20 |
IDX_PATH = ART / "doc_index.csv"
|
|
|
|
| 21 |
|
| 22 |
-
#
|
| 23 |
-
#
|
| 24 |
-
#
|
| 25 |
def strip_accents(s: str) -> str:
|
| 26 |
return "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
|
| 27 |
|
| 28 |
-
# stopwords españolas normalizadas (compacta; puedes ampliar)
|
| 29 |
STOPWORDS = {
|
| 30 |
-
"a","
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"sean","ser","si","sí","sido","sin","sobre","su","sus","tal","tambien","también","tampoco","tan",
|
| 41 |
-
"tanta","tantas","tanto","te","tenia","tenía","tenian","tenían","tendra","tendrá","tendran","tendrán",
|
| 42 |
-
"tenemos","tengo","ti","tiene","tienen","todo","todos","tu","tus","un","una","unas","uno","unos",
|
| 43 |
-
"usted","ustedes","y","ya"
|
| 44 |
}
|
| 45 |
STOPWORDS = {strip_accents(w.lower()) for w in STOPWORDS} | {"aun"}
|
| 46 |
|
| 47 |
def clean_text(s: str) -> str:
|
| 48 |
-
if not isinstance(s, str):
|
| 49 |
-
s = "" if s is None else str(s)
|
| 50 |
s = strip_accents(s.lower())
|
| 51 |
-
s = re.sub(r"[“”„‟‹›«»—–‐-‒–—―\-]", " ", s)
|
| 52 |
-
s = re.sub(r"[^\w\s]", " ", s)
|
| 53 |
s = re.sub(r"\s+", " ", s).strip()
|
| 54 |
toks = [t for t in s.split() if t not in STOPWORDS and not t.isdigit()]
|
| 55 |
return " ".join(toks)
|
|
@@ -62,28 +57,17 @@ def catalog_tag(source_file: str) -> str:
|
|
| 62 |
return "OTRO"
|
| 63 |
|
| 64 |
def parse_code_name(codes_raw: str, text_original: str) -> Tuple[str, str]:
|
| 65 |
-
codes_raw = str(codes_raw or "")
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
if
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
if m2: name = m2.group(1).strip()
|
| 77 |
-
if code is None or name is None:
|
| 78 |
-
m1 = re.search(r"CODIGO\s*:\s*([^|]+)", text_original, flags=re.I)
|
| 79 |
-
m2 = re.search(r"NOMBRE\s*:\s*([^|]+)", text_original, flags=re.I)
|
| 80 |
-
if m1 and code is None: code = m1.group(1).strip()
|
| 81 |
-
if m2 and name is None: name = m2.group(1).strip()
|
| 82 |
-
return (code or "").strip(), (name or "").strip()
|
| 83 |
-
|
| 84 |
-
# -----------------------------
|
| 85 |
-
# Reglas duras (tu bloque)
|
| 86 |
-
# -----------------------------
|
| 87 |
REGLAS = [
|
| 88 |
{
|
| 89 |
"keywords": ["ops", "orden de prestacion de servicios", "contrato ops"],
|
|
@@ -106,8 +90,7 @@ REGLAS = [
|
|
| 106 |
{
|
| 107 |
'keywords': ["viatico", "viaticos"],
|
| 108 |
'respuesta': {
|
| 109 |
-
'CICP': ("2.3.2.02.
|
| 110 |
-
("2.3.2.02.02.010", "Servicios administrativos de apoyo"),
|
| 111 |
'CPC': ("901", "Gastos directos de la administración pública"),
|
| 112 |
'UNSPSC': ("20102301", "Gastos de viaje y manutención"),
|
| 113 |
},
|
|
@@ -208,187 +191,142 @@ REGLAS = [
|
|
| 208 |
def aplicar_reglas(query: str):
|
| 209 |
q = clean_text(query)
|
| 210 |
for r in REGLAS:
|
| 211 |
-
# Coincidencia OR por keyword (normalizado)
|
| 212 |
for kw in r["keywords"]:
|
| 213 |
pat = re.escape(kw).replace(r"\ ", r".*")
|
| 214 |
if re.search(pat, q):
|
| 215 |
df = pd.DataFrame(
|
| 216 |
-
[{"Catálogo": k, "Código": v[0], "Nombre": v[1], "Similaridad": 1.0}
|
|
|
|
| 217 |
)
|
| 218 |
-
return df, f"⚙️ Regla activada: {r['motivo']}"
|
| 219 |
return None, None
|
| 220 |
|
| 221 |
-
#
|
| 222 |
-
# Carga
|
| 223 |
-
#
|
| 224 |
VECTOR = None
|
| 225 |
MATRIX = None
|
| 226 |
INDEX = None
|
| 227 |
|
| 228 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 229 |
-
from sklearn.utils.validation import check_is_fitted
|
| 230 |
-
from sklearn.exceptions import NotFittedError
|
| 231 |
-
|
| 232 |
def _is_fitted_vectorizer(vec) -> bool:
|
| 233 |
try:
|
| 234 |
-
check_is_fitted(vec, attributes=["vocabulary_"])
|
| 235 |
-
check_is_fitted(vec._tfidf, attributes=["idf_"])
|
| 236 |
return True
|
| 237 |
except Exception:
|
| 238 |
return False
|
| 239 |
|
| 240 |
def _train_and_persist_from_index(index_df: pd.DataFrame):
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
# 2) Entrenar vectorizador
|
| 248 |
-
vec = TfidfVectorizer(
|
| 249 |
-
analyzer="word",
|
| 250 |
-
token_pattern=r"(?u)\b\w+\b",
|
| 251 |
-
min_df=1,
|
| 252 |
-
max_df=0.9,
|
| 253 |
-
ngram_range=(1, 2),
|
| 254 |
-
sublinear_tf=True,
|
| 255 |
-
norm="l2",
|
| 256 |
-
)
|
| 257 |
-
X = vec.fit_transform(corpus)
|
| 258 |
-
|
| 259 |
-
# 3) Persistir
|
| 260 |
ART.mkdir(exist_ok=True, parents=True)
|
| 261 |
-
joblib.dump(vec, VEC_PATH)
|
| 262 |
-
sparse.save_npz(MAT_PATH, X)
|
| 263 |
-
|
| 264 |
return vec, X
|
| 265 |
|
| 266 |
def ensure_loaded():
|
| 267 |
-
"""Carga artefactos; si el vectorizador no está fit, reentrena desde el índice."""
|
| 268 |
global VECTOR, MATRIX, INDEX
|
| 269 |
-
|
| 270 |
if INDEX is None:
|
| 271 |
INDEX = pd.read_csv(IDX_PATH)
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
if VEC_PATH.exists():
|
| 275 |
-
try:
|
| 276 |
-
vec = joblib.load(VEC_PATH)
|
| 277 |
-
except Exception:
|
| 278 |
-
vec = None
|
| 279 |
-
|
| 280 |
-
X = None
|
| 281 |
-
if MAT_PATH.exists():
|
| 282 |
-
try:
|
| 283 |
-
X = sparse.load_npz(MAT_PATH)
|
| 284 |
-
except Exception:
|
| 285 |
-
X = None
|
| 286 |
-
|
| 287 |
if vec is None or not _is_fitted_vectorizer(vec):
|
| 288 |
vec, X = _train_and_persist_from_index(INDEX)
|
| 289 |
-
|
| 290 |
-
if
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
try:
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
# Top N GLOBAL (ya no por catálogo)
|
| 329 |
-
out = (
|
| 330 |
-
df[["Catálogo","Código","Nombre","Similaridad"]]
|
| 331 |
-
.sort_values("Similaridad", ascending=False)
|
| 332 |
-
.head(int(k))
|
| 333 |
-
.reset_index(drop=True)
|
| 334 |
-
)
|
| 335 |
-
if out.empty:
|
| 336 |
-
return pd.DataFrame(), "Sin candidatos."
|
| 337 |
-
return out, "OK"
|
| 338 |
-
except Exception as e:
|
| 339 |
-
return pd.DataFrame(), f"Error: {type(e).__name__}: {e}"
|
| 340 |
-
|
| 341 |
-
# -----------------------------
|
| 342 |
-
# Exportar a Excel
|
| 343 |
-
# -----------------------------
|
| 344 |
-
def exportar_excel(query: str, k: int) -> str:
|
| 345 |
-
"""Genera un Excel con los resultados actuales y retorna la ruta."""
|
| 346 |
-
df, _ = recomendar(query, k)
|
| 347 |
-
# Asegura algo descargable aunque no haya resultados
|
| 348 |
-
path = "/tmp/busqueda.xlsx"
|
| 349 |
-
if isinstance(df, pd.DataFrame) and not df.empty:
|
| 350 |
-
df.to_excel(path, index=False, sheet_name="Resultados")
|
| 351 |
-
else:
|
| 352 |
-
pd.DataFrame(columns=["Catálogo","Código","Nombre","Similaridad"]).to_excel(
|
| 353 |
-
path, index=False, sheet_name="Resultados"
|
| 354 |
-
)
|
| 355 |
-
return path
|
| 356 |
-
|
| 357 |
-
# -----------------------------
|
| 358 |
-
# Interfaz Gradio
|
| 359 |
-
# -----------------------------
|
| 360 |
with gr.Blocks(title="Recomendador por texto (CICP / CPC / UNSPSC)") as demo:
|
| 361 |
gr.Markdown("# Recomendador por texto (CICP / CPC / UNSPSC)\n\n_TF-IDF + reglas_")
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
)
|
| 368 |
-
k = gr.Slider(1, 30, value=10, step=1, label="Top N (global)") # ← ya no 'Top por catálogo'
|
| 369 |
with gr.Row():
|
| 370 |
btn = gr.Button("Buscar", variant="primary")
|
| 371 |
-
btn_xlsx = gr.Button("Descargar búsqueda
|
| 372 |
out = gr.Dataframe(headers=["Catálogo","Código","Nombre","Similaridad"], label="Resultados", wrap=True)
|
| 373 |
status = gr.Markdown()
|
| 374 |
file_out = gr.File(label="Archivo generado", interactive=False)
|
| 375 |
|
| 376 |
-
def
|
| 377 |
-
df, msg = recomendar(q
|
| 378 |
return df, (f"**Estado:** {msg}" if msg else "")
|
| 379 |
|
| 380 |
-
def _on_download(q
|
| 381 |
-
path =
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
# Click en botón
|
| 385 |
-
btn.click(_on_click, inputs=[query, k], outputs=[out, status])
|
| 386 |
-
|
| 387 |
-
# Buscar al presionar ENTER en el Textbox
|
| 388 |
-
query.submit(_on_click, inputs=[query, k], outputs=[out, status])
|
| 389 |
|
| 390 |
-
|
| 391 |
-
|
|
|
|
| 392 |
|
| 393 |
if __name__ == "__main__":
|
| 394 |
demo.launch()
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
import re, unicodedata
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Tuple
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import joblib
|
| 7 |
import pandas as pd
|
| 8 |
from scipy import sparse
|
| 9 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
+
from sklearn.utils.validation import check_is_fitted
|
| 12 |
|
| 13 |
+
# =========================
# Config
# =========================
ROOT = Path(__file__).parent  # directory containing this app.py
ART = ROOT / "artifacts"  # persisted model artifacts live here
VEC_PATH = ART / "tfidf_vectorizer.joblib"  # fitted TfidfVectorizer
MAT_PATH = ART / "tfidf_matrix.npz"  # sparse TF-IDF document-term matrix
IDX_PATH = ART / "doc_index.csv"  # document index (source_file / text / codes)
TOP_N = 10  # fixed global Top-N shown to the user (the panel slider was removed)
|
| 22 |
|
| 23 |
+
# =========================
|
| 24 |
+
# Utils de texto
|
| 25 |
+
# =========================
|
| 26 |
def strip_accents(s: str) -> str:
    """Remove diacritical marks from *s* (e.g. 'camión' -> 'camion')."""
    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
|
| 28 |
|
|
|
|
| 29 |
# Spanish stopword list (compact; extend as needed).
STOPWORDS = {
    "a","al","algo","algunas","algunos","ante","antes","aquel","aquella","aquellas","aquellos","aqui","así","aun","aunque",
    "bajo","bien","cada","casi","cierta","ciertas","cierto","ciertos","como","con","contra","cual","cuales","cualquier",
    "cualesquiera","cuyo","cuya","cuyas","cuyos","de","del","desde","donde","dos","el","ella","ellas","ellos","en","entre",
    "era","eran","eres","es","esa","esas","ese","eso","esos","esta","estaba","estaban","estamos","estan","estar","estas",
    "este","esto","estos","fue","fueron","ha","habia","habían","haber","hay","hasta","la","las","le","les","lo","los",
    "mas","más","me","mi","mis","mucha","muchas","mucho","muchos","muy","nada","ni","no","nos","nosotras","nosotros",
    "nuestra","nuestras","nuestro","nuestros","o","otra","otras","otro","otros","para","pero","poco","por","porque",
    "que","quien","quienes","se","sea","sean","ser","si","sí","sido","sin","sobre","su","sus","tal","tambien","también",
    "tampoco","tan","tanta","tantas","tanto","te","tenia","tenían","tendrá","tendrán","tenemos","tengo","ti","tiene",
    "tienen","todo","todos","tu","tus","un","una","unas","uno","unos","usted","ustedes","y","ya"
}
# Normalize to lowercase/accent-free so membership tests match clean_text() output.
STOPWORDS = {strip_accents(w.lower()) for w in STOPWORDS} | {"aun"}
|
| 42 |
|
| 43 |
def clean_text(s: str) -> str:
    """Normalize free text for matching: lowercase, accent-free, punctuation
    stripped, stopwords and purely numeric tokens dropped."""
    text = s if isinstance(s, str) else ("" if s is None else str(s))
    text = strip_accents(text.lower())
    # typographic quotes/dashes first, then any remaining punctuation -> space
    for pattern in (r"[“”„‟‹›«»—–‐-‒–—―\-]", r"[^\w\s]"):
        text = re.sub(pattern, " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return " ".join(tok for tok in text.split()
                    if tok not in STOPWORDS and not tok.isdigit())
|
|
|
|
| 57 |
return "OTRO"
|
| 58 |
|
| 59 |
def parse_code_name(codes_raw: str, text_original: str) -> Tuple[str, str]:
    """Extract a (code, name) pair from the raw index fields.

    Tries the combined 'CODIGO;NOMBRE: code ; name' form first (in either
    field), then falls back to separate 'CODIGO:' / 'NOMBRE:' fields.
    Missing pieces come back as empty strings.
    """
    sources = (str(codes_raw or ""), str(text_original or ""))

    def find(pattern):
        # first match wins, scanning codes_raw before text_original
        for src in sources:
            hit = re.search(pattern, src, flags=re.I)
            if hit:
                return hit
        return None

    combined = find(r"CODIGO;NOMBRE:\s*([^;|]+)\s*;\s*([^|]+)")
    if combined:
        return combined.group(1).strip(), combined.group(2).strip()

    code = find(r"CODIGO\s*:\s*([^|]+)")
    name = find(r"NOMBRE\s*:\s*([^|]+)")
    return (code.group(1).strip() if code else ""), (name.group(1).strip() if name else "")
|
| 67 |
+
|
| 68 |
+
# =========================
|
| 69 |
+
# Reglas
|
| 70 |
+
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
REGLAS = [
|
| 72 |
{
|
| 73 |
"keywords": ["ops", "orden de prestacion de servicios", "contrato ops"],
|
|
|
|
| 90 |
{
|
| 91 |
'keywords': ["viatico", "viaticos"],
|
| 92 |
'respuesta': {
|
| 93 |
+
'CICP': ("2.3.2.02.02.010", "Servicios administrativos de apoyo"),
|
|
|
|
| 94 |
'CPC': ("901", "Gastos directos de la administración pública"),
|
| 95 |
'UNSPSC': ("20102301", "Gastos de viaje y manutención"),
|
| 96 |
},
|
|
|
|
| 191 |
def aplicar_reglas(query: str):
|
| 192 |
q = clean_text(query)
|
| 193 |
for r in REGLAS:
|
|
|
|
| 194 |
for kw in r["keywords"]:
|
| 195 |
pat = re.escape(kw).replace(r"\ ", r".*")
|
| 196 |
if re.search(pat, q):
|
| 197 |
df = pd.DataFrame(
|
| 198 |
+
[{"Catálogo": k, "Código": v[0], "Nombre": v[1], "Similaridad": 1.0}
|
| 199 |
+
for k, v in r["respuesta"].items()]
|
| 200 |
)
|
| 201 |
+
return df.sort_values("Catálogo"), f"⚙️ Regla activada: {r['motivo']}"
|
| 202 |
return None, None
|
| 203 |
|
| 204 |
+
# =========================
|
| 205 |
+
# Carga/entrenamiento TF-IDF
|
| 206 |
+
# =========================
|
| 207 |
VECTOR = None
|
| 208 |
MATRIX = None
|
| 209 |
INDEX = None
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def _is_fitted_vectorizer(vec) -> bool:
|
| 212 |
try:
|
| 213 |
+
check_is_fitted(vec, attributes=["vocabulary_"]); check_is_fitted(vec._tfidf, attributes=["idf_"])
|
|
|
|
| 214 |
return True
|
| 215 |
except Exception:
|
| 216 |
return False
|
| 217 |
|
| 218 |
def _train_and_persist_from_index(index_df: pd.DataFrame):
    """Fit a TF-IDF vectorizer on the index corpus and persist the artifacts.

    Prefers the pre-lemmatized column when present; otherwise cleans the
    original text on the fly. Saves the vectorizer and the document-term
    matrix under ART.

    Returns:
        (vectorizer, sparse matrix) ready for similarity queries.
    """
    # 1) Build the corpus. fillna("") on BOTH branches: a NaN cell in
    #    tokens_lemmatized would otherwise crash fit_transform.
    if "tokens_lemmatized" in index_df.columns:
        corpus = index_df["tokens_lemmatized"].fillna("").astype(str)
    else:
        corpus = index_df["text_original"].fillna("").astype(str).map(clean_text)

    # 2) Train the vectorizer
    vec = TfidfVectorizer(
        analyzer="word",
        token_pattern=r"(?u)\b\w+\b",
        min_df=1,
        max_df=0.9,
        ngram_range=(1, 2),
        sublinear_tf=True,
        norm="l2",
    )
    X = vec.fit_transform(list(corpus))

    # 3) Persist
    ART.mkdir(exist_ok=True, parents=True)
    joblib.dump(vec, VEC_PATH)
    sparse.save_npz(MAT_PATH, X)
    return vec, X
|
| 228 |
|
| 229 |
def ensure_loaded():
    """Load (or lazily rebuild) the TF-IDF artifacts into the module globals.

    Order of attempts:
      1. reuse already-loaded globals (skips disk I/O on every query),
      2. load the persisted vectorizer/matrix, tolerating corrupt files,
      3. retrain from the index if the vectorizer is missing or unfit,
      4. re-vectorize the corpus if only the matrix is missing.
    """
    global VECTOR, MATRIX, INDEX
    if VECTOR is not None and MATRIX is not None and INDEX is not None:
        return  # already initialized

    if INDEX is None:
        INDEX = pd.read_csv(IDX_PATH)

    # Load artifacts defensively: a corrupt/incompatible file must fall
    # through to retraining, not crash the app at startup.
    vec = None
    if VEC_PATH.exists():
        try:
            vec = joblib.load(VEC_PATH)
        except Exception:
            vec = None
    X = None
    if MAT_PATH.exists():
        try:
            X = sparse.load_npz(MAT_PATH)
        except Exception:
            X = None

    if vec is None or not _is_fitted_vectorizer(vec):
        vec, X = _train_and_persist_from_index(INDEX)
    elif X is None:
        # Vectorizer is fine but the matrix is missing: re-vectorize and persist.
        corpus = (INDEX["tokens_lemmatized"] if "tokens_lemmatized" in INDEX.columns else
                  INDEX["text_original"].fillna("").astype(str).map(clean_text))
        X = vec.transform(list(corpus))
        sparse.save_npz(MAT_PATH, X)

    VECTOR, MATRIX = vec, X
|
| 243 |
+
|
| 244 |
+
# =========================
|
| 245 |
+
# Búsqueda
|
| 246 |
+
# =========================
|
| 247 |
+
def recomendar(query: str):
    """Return (results DataFrame, status message) for a free-text query.

    Hard rules take precedence; otherwise the cleaned query is vectorized
    and ranked by cosine similarity against the TF-IDF matrix, returning
    the global top TOP_N rows across all catalogs.
    """
    # 1) Hard rules (curated keyword -> fixed answer)
    df_regla, motivo = aplicar_reglas(query)
    if df_regla is not None:
        return df_regla, motivo

    # 2) TF-IDF model
    ensure_loaded()
    q = clean_text(query)
    if not q:
        # everything was stopwords/punctuation/digits
        return pd.DataFrame(), "La consulta quedó vacía tras limpieza."

    xq = VECTOR.transform([q]); sims = cosine_similarity(xq, MATRIX).flatten()
    df = INDEX.copy()
    df["Similaridad"] = sims
    df["Catálogo"] = df["source_file"].apply(catalog_tag)
    # Extract (code, name) from the raw index fields, row by row.
    parsed = df.apply(lambda r: parse_code_name(r.get("codes_raw",""), r.get("text_original","")), axis=1)
    df["Código"] = [c for c,_ in parsed]; df["Nombre"] = [n for _,n in parsed]

    # Global Top-N (no longer per catalog)
    out = (df[["Catálogo","Código","Nombre","Similaridad"]]
           .sort_values("Similaridad", ascending=False)
           .head(TOP_N)
           .reset_index(drop=True))
    if out.empty: return pd.DataFrame(), "Sin candidatos."
    return out, "OK"
|
| 272 |
+
|
| 273 |
+
# =========================
|
| 274 |
+
# Exportar (xlsx con fallback a csv)
|
| 275 |
+
# =========================
|
| 276 |
+
def exportar(query: str) -> Tuple[str, str]:
    """Export the current search results to a downloadable file.

    Tries .xlsx via openpyxl, then xlsxwriter; falls back to CSV when no
    Excel engine is available.

    Returns:
        (file path, user-facing status message).
    """
    # Local imports keep the module's import block untouched.
    import os
    import tempfile

    df, _ = recomendar(query)
    if df is None or df.empty:
        # Always produce a file, even when there are no results.
        df = pd.DataFrame(columns=["Catálogo","Código","Nombre","Similaridad"])

    # tempfile.gettempdir(): portable replacement for the hard-coded /tmp
    # (which does not exist on Windows).
    tmpdir = tempfile.gettempdir()
    xlsx_path = os.path.join(tmpdir, "busqueda.xlsx")
    for engine, msg in (
        ("openpyxl", "Archivo Excel (.xlsx) generado."),
        ("xlsxwriter", "Archivo Excel (.xlsx) generado (xlsxwriter)."),
    ):
        try:
            with pd.ExcelWriter(xlsx_path, engine=engine) as w:
                df.to_excel(w, index=False, sheet_name="Resultados")
            return xlsx_path, msg
        except Exception:
            continue  # engine missing/failed -> try the next option

    # Fallback: CSV needs no third-party engine.
    csv_path = os.path.join(tmpdir, "busqueda.csv")
    df.to_csv(csv_path, index=False)
    return csv_path, "openpyxl/xlsxwriter no disponibles: se generó CSV."
|
| 300 |
+
|
| 301 |
+
# =========================
|
| 302 |
+
# UI
|
| 303 |
+
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
# Gradio UI: one textbox feeding a search (button or ENTER) and an export.
with gr.Blocks(title="Recomendador por texto (CICP / CPC / UNSPSC)") as demo:
    gr.Markdown("# Recomendador por texto (CICP / CPC / UNSPSC)\n\n_TF-IDF + reglas_")
    query = gr.Textbox(
        label="Descripción técnica",
        placeholder="reactivos de laboratorio para cromatografía hplc",
        lines=3
    )
    with gr.Row():
        btn = gr.Button("Buscar", variant="primary")
        btn_xlsx = gr.Button("Descargar búsqueda")
    out = gr.Dataframe(headers=["Catálogo","Código","Nombre","Similaridad"], label="Resultados", wrap=True)
    status = gr.Markdown()
    file_out = gr.File(label="Archivo generado", interactive=False)

    def _on_search(q):
        # Run the recommendation and format the status line.
        df, msg = recomendar(q)
        return df, (f"**Estado:** {msg}" if msg else "")

    def _on_download(q):
        # Export and surface the export message in the status area too.
        path, info = exportar(q)
        return path, f"**Descarga:** {info}"

    btn.click(_on_search, inputs=[query], outputs=[out, status])
    query.submit(_on_search, inputs=[query], outputs=[out, status])  # search on ENTER
    btn_xlsx.click(_on_download, inputs=[query], outputs=[file_out, status])

if __name__ == "__main__":
    demo.launch()
|