Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,7 +18,6 @@ ART = ROOT / "artifacts"
|
|
| 18 |
VEC_PATH = ART / "tfidf_vectorizer.joblib"
|
| 19 |
MAT_PATH = ART / "tfidf_matrix.npz"
|
| 20 |
IDX_PATH = ART / "doc_index.csv"
|
| 21 |
-
TOP_N = 10 # ← Top global fijo (se eliminó el control del panel)
|
| 22 |
|
| 23 |
# =========================
|
| 24 |
# Utils de texto
|
|
@@ -27,15 +26,15 @@ def strip_accents(s: str) -> str:
|
|
| 27 |
return "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
|
| 28 |
|
| 29 |
STOPWORDS = {
|
| 30 |
-
"a","al","algo","algunas","algunos","ante","antes","aquel","aquella","aquellas","aquellos","aqui","
|
| 31 |
"bajo","bien","cada","casi","cierta","ciertas","cierto","ciertos","como","con","contra","cual","cuales","cualquier",
|
| 32 |
"cualesquiera","cuyo","cuya","cuyas","cuyos","de","del","desde","donde","dos","el","ella","ellas","ellos","en","entre",
|
| 33 |
"era","eran","eres","es","esa","esas","ese","eso","esos","esta","estaba","estaban","estamos","estan","estar","estas",
|
| 34 |
-
"este","esto","estos","fue","fueron","ha","habia","
|
| 35 |
-
"mas","
|
| 36 |
"nuestra","nuestras","nuestro","nuestros","o","otra","otras","otro","otros","para","pero","poco","por","porque",
|
| 37 |
-
"que","quien","quienes","se","sea","sean","ser","si","
|
| 38 |
-
"tampoco","tan","tanta","tantas","tanto","te","tenia","
|
| 39 |
"tienen","todo","todos","tu","tus","un","una","unas","uno","unos","usted","ustedes","y","ya"
|
| 40 |
}
|
| 41 |
STOPWORDS = {strip_accents(w.lower()) for w in STOPWORDS} | {"aun"}
|
|
@@ -43,64 +42,24 @@ STOPWORDS = {strip_accents(w.lower()) for w in STOPWORDS} | {"aun"}
|
|
| 43 |
def clean_text(s: str) -> str:
|
| 44 |
if not isinstance(s, str): s = "" if s is None else str(s)
|
| 45 |
s = strip_accents(s.lower())
|
| 46 |
-
s = re.sub(r"[
|
| 47 |
s = re.sub(r"[^\w\s]", " ", s)
|
| 48 |
s = re.sub(r"\s+", " ", s).strip()
|
| 49 |
toks = [t for t in s.split() if t not in STOPWORDS and not t.isdigit()]
|
| 50 |
return " ".join(toks)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def catalog_tag(source_file: str) -> str:
|
| 53 |
s = (source_file or "").lower()
|
| 54 |
if "cicp" in s: return "CICP"
|
| 55 |
-
if "cpc"
|
| 56 |
if "unspsc" in s: return "UNSPSC"
|
| 57 |
return "OTRO"
|
| 58 |
-
|
| 59 |
-
def parse_code_name(codes_raw: str, text_original: str) -> Tuple[str, str]:
    """Extract a ``(code, name)`` pair from an index row.

    Both inputs are coerced with ``str(value or "")``.  Every pattern is
    tried against ``codes_raw`` first and ``text_original`` second, case
    insensitively.  The combined ``CODIGO;NOMBRE: <code> ; <name>`` layout
    wins when present; otherwise separate ``CODIGO:`` and ``NOMBRE:`` fields
    are looked up independently.

    Returns:
        The stripped code and name; a part that is not found is ``""``.
    """
    sources = (str(codes_raw or ""), str(text_original or ""))

    def first_hit(pattern):
        # First match across the sources, honouring the codes_raw-first order.
        for text in sources:
            hit = re.search(pattern, text, flags=re.I)
            if hit:
                return hit
        return None

    combined = first_hit(r"CODIGO;NOMBRE:\s*([^;|]+)\s*;\s*([^|]+)")
    if combined:
        return combined.group(1).strip(), combined.group(2).strip()

    code_hit = first_hit(r"CODIGO\s*:\s*([^|]+)")
    name_hit = first_hit(r"NOMBRE\s*:\s*([^|]+)")
    code = code_hit.group(1).strip() if code_hit else ""
    name = name_hit.group(1).strip() if name_hit else ""
    return code, name
|
| 67 |
-
|
| 68 |
-
# --- añade esto cerca de tus utilidades, debajo de parse_code_name ---
|
| 69 |
-
ORDER_CATS = ["CICP", "CPC", "UNSPSC"]
|
| 70 |
-
|
| 71 |
-
def normalize_unspsc_if_cpc_901(rows):
    """Force the UNSPSC row to "N/A" when the selected CPC code is 901.

    Args:
        rows: iterable of dicts with the keys ``'Catálogo'``, ``'Código'``,
            ``'Nombre'`` and ``'Similaridad'``.

    Returns:
        A new list.  When some CPC row has code ``"901"`` (compared after
        ``str(...).strip()``), every UNSPSC row is replaced by a fresh
        ``N/A`` placeholder with similarity 1.0; all other rows, including
        the CPC row itself, are passed through untouched.
    """
    rows = list(rows)
    # The CPC row is the trigger, not the target: it must survive so the
    # result still carries one row per catalogue.
    cpc_is_901 = any(
        r["Catálogo"] == "CPC" and str(r["Código"]).strip() == "901"
        for r in rows
    )
    if not cpc_is_901:
        return rows
    return [
        {"Catálogo": "UNSPSC", "Código": "N/A", "Nombre": "N/A", "Similaridad": 1.0}
        if r["Catálogo"] == "UNSPSC" else r
        for r in rows
    ]
|
| 80 |
-
|
| 81 |
-
def order_and_fill_one_per_catalog(df):
    """Return one row per catalogue in the fixed ``ORDER_CATS`` order.

    The highest-``Similaridad`` row of each ``Catálogo`` is kept, the rows
    are passed through ``normalize_unspsc_if_cpc_901``, catalogues outside
    ``ORDER_CATS`` are dropped, and any expected catalogue that is missing
    gets an empty placeholder row with similarity 0.0.

    Returns:
        A DataFrame with the columns Catálogo, Código, Nombre, Similaridad.
    """
    columns = ["Catálogo", "Código", "Nombre", "Similaridad"]

    # Best row per catalogue: sort descending, then take the head of each group.
    ranked = df.sort_values("Similaridad", ascending=False)
    top_per_catalog = ranked.groupby("Catálogo", as_index=False).head(1)

    # Plain dicts are needed by the 901 normalisation step.
    records = [{col: row[col] for col in columns}
               for _, row in top_per_catalog.iterrows()]
    records = normalize_unspsc_if_cpc_901(records)

    kept = [rec for rec in records if rec["Catálogo"] in ORDER_CATS]
    present = {rec["Catálogo"] for rec in kept}
    kept.extend(
        {"Catálogo": cat, "Código": "", "Nombre": "", "Similaridad": 0.0}
        for cat in ORDER_CATS
        if cat not in present
    )
    # A single stable sort after filling is enough to enforce the fixed order.
    kept.sort(key=lambda rec: ORDER_CATS.index(rec["Catálogo"]))
    return pd.DataFrame(kept, columns=columns)
|
| 104 |
|
| 105 |
# =========================
|
| 106 |
# Reglas
|
|
@@ -674,22 +633,95 @@ REGLAS = [
|
|
| 674 |
},
|
| 675 |
]
|
| 676 |
|
| 677 |
-
|
| 678 |
def aplicar_reglas(query: str):
|
| 679 |
-
|
| 680 |
for r in REGLAS:
|
| 681 |
for kw in r["keywords"]:
|
| 682 |
-
|
| 683 |
-
if
|
| 684 |
-
|
|
|
|
|
|
|
| 685 |
[{"Catálogo": k, "Código": v[0], "Nombre": v[1], "Similaridad": 1.0}
|
| 686 |
for k, v in r["respuesta"].items()]
|
| 687 |
)
|
| 688 |
-
return
|
| 689 |
return None, None
|
| 690 |
|
| 691 |
# =========================
|
| 692 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
# =========================
|
| 694 |
VECTOR = None
|
| 695 |
MATRIX = None
|
|
@@ -697,14 +729,16 @@ INDEX = None
|
|
| 697 |
|
| 698 |
def _is_fitted_vectorizer(vec) -> bool:
|
| 699 |
try:
|
| 700 |
-
check_is_fitted(vec, attributes=["vocabulary_"])
|
|
|
|
| 701 |
return True
|
| 702 |
except Exception:
|
| 703 |
return False
|
| 704 |
|
| 705 |
def _train_and_persist_from_index(index_df: pd.DataFrame):
|
| 706 |
-
corpus = (index_df["tokens_lemmatized"]
|
| 707 |
-
|
|
|
|
| 708 |
vec = TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b",
|
| 709 |
min_df=1, max_df=0.9, ngram_range=(1,2),
|
| 710 |
sublinear_tf=True, norm="l2")
|
|
@@ -722,8 +756,9 @@ def ensure_loaded():
|
|
| 722 |
if vec is None or not _is_fitted_vectorizer(vec):
|
| 723 |
vec, X = _train_and_persist_from_index(INDEX)
|
| 724 |
elif X is None:
|
| 725 |
-
corpus = (INDEX["tokens_lemmatized"]
|
| 726 |
-
|
|
|
|
| 727 |
X = vec.transform(list(corpus))
|
| 728 |
sparse.save_npz(MAT_PATH, X)
|
| 729 |
VECTOR, MATRIX = vec, X
|
|
@@ -735,7 +770,8 @@ def recomendar(query: str):
|
|
| 735 |
# 1) Reglas
|
| 736 |
df_regla, motivo = aplicar_reglas(query)
|
| 737 |
if df_regla is not None:
|
| 738 |
-
|
|
|
|
| 739 |
|
| 740 |
# 2) Modelo
|
| 741 |
ensure_loaded()
|
|
@@ -743,35 +779,37 @@ def recomendar(query: str):
|
|
| 743 |
if not q:
|
| 744 |
return pd.DataFrame(), "La consulta quedó vacía tras limpieza."
|
| 745 |
|
| 746 |
-
xq = VECTOR.transform([q])
|
|
|
|
| 747 |
df = INDEX.copy()
|
| 748 |
df["Similaridad"] = sims
|
| 749 |
df["Catálogo"] = df["source_file"].apply(catalog_tag)
|
| 750 |
-
parsed = df.apply(lambda r: parse_code_name(r.get("codes_raw",""), r.get("text_original","")), axis=1)
|
| 751 |
-
df["Código"] = [c for c,_ in parsed]; df["Nombre"] = [n for _,n in parsed]
|
| 752 |
|
| 753 |
-
#
|
| 754 |
-
df = df[
|
| 755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
|
|
|
|
|
|
|
| 757 |
return df_out, "OK"
|
| 758 |
|
| 759 |
# =========================
|
| 760 |
# Exportar (xlsx con fallback a csv)
|
| 761 |
# =========================
|
| 762 |
-
def exportar(query: str)
|
| 763 |
df, _ = recomendar(query)
|
| 764 |
if df is None or df.empty:
|
| 765 |
df = pd.DataFrame(columns=["Catálogo","Código","Nombre","Similaridad"])
|
| 766 |
-
|
| 767 |
-
# Intento 1: xlsx con openpyxl
|
| 768 |
try:
|
| 769 |
path = "/tmp/busqueda.xlsx"
|
| 770 |
with pd.ExcelWriter(path, engine="openpyxl") as w:
|
| 771 |
df.to_excel(w, index=False, sheet_name="Resultados")
|
| 772 |
return path, "Archivo Excel (.xlsx) generado."
|
| 773 |
except Exception:
|
| 774 |
-
# Intento 2: xlsx con xlsxwriter
|
| 775 |
try:
|
| 776 |
import xlsxwriter # noqa: F401
|
| 777 |
path = "/tmp/busqueda.xlsx"
|
|
@@ -779,16 +817,15 @@ def exportar(query: str) -> Tuple[str, str]:
|
|
| 779 |
df.to_excel(w, index=False, sheet_name="Resultados")
|
| 780 |
return path, "Archivo Excel (.xlsx) generado (xlsxwriter)."
|
| 781 |
except Exception:
|
| 782 |
-
# Fallback: CSV
|
| 783 |
path = "/tmp/busqueda.csv"
|
| 784 |
df.to_csv(path, index=False)
|
| 785 |
return path, "openpyxl/xlsxwriter no disponibles: se generó CSV."
|
| 786 |
|
| 787 |
# =========================
|
| 788 |
-
# UI
|
| 789 |
# =========================
|
| 790 |
with gr.Blocks(title="Recomendador de Códigos (CICP / CPC / UNSPSC)") as demo:
|
| 791 |
-
gr.Markdown("# Recomendador de Códigos (CICP / CPC / UNSPSC)
|
| 792 |
query = gr.Textbox(
|
| 793 |
label="Descripción técnica",
|
| 794 |
placeholder="reactivos de laboratorio para cromatografía hplc",
|
|
@@ -807,11 +844,10 @@ with gr.Blocks(title="Recomendador de Códigos (CICP / CPC / UNSPSC)") as demo:
|
|
| 807 |
|
| 808 |
def _on_download(q):
|
| 809 |
path, info = exportar(q)
|
| 810 |
-
# mostramos mensaje en status también
|
| 811 |
return path, f"**Descarga:** {info}"
|
| 812 |
|
| 813 |
btn.click(_on_search, inputs=[query], outputs=[out, status])
|
| 814 |
-
query.submit(_on_search, inputs=[query], outputs=[out, status])
|
| 815 |
btn_xlsx.click(_on_download, inputs=[query], outputs=[file_out, status])
|
| 816 |
|
| 817 |
if __name__ == "__main__":
|
|
|
|
| 18 |
VEC_PATH = ART / "tfidf_vectorizer.joblib"
|
| 19 |
MAT_PATH = ART / "tfidf_matrix.npz"
|
| 20 |
IDX_PATH = ART / "doc_index.csv"
|
|
|
|
| 21 |
|
| 22 |
# =========================
|
| 23 |
# Utils de texto
|
|
|
|
| 26 |
return "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
|
| 27 |
|
| 28 |
# Spanish stop words, written without accents.
# NOTE(review): the original literal listed "mas", "si" and "tambien" twice
# (possibly once-accented variants); a set ignores duplicates, so each is
# written once here.
STOPWORDS = {
    "a", "al", "algo", "algunas", "algunos", "ante", "antes", "aquel",
    "aquella", "aquellas", "aquellos", "aqui", "asi", "aun", "aunque",
    "bajo", "bien", "cada", "casi", "cierta", "ciertas", "cierto", "ciertos",
    "como", "con", "contra", "cual", "cuales", "cualquier",
    "cualesquiera", "cuyo", "cuya", "cuyas", "cuyos", "de", "del", "desde",
    "donde", "dos", "el", "ella", "ellas", "ellos", "en", "entre",
    "era", "eran", "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta",
    "estaba", "estaban", "estamos", "estan", "estar", "estas",
    "este", "esto", "estos", "fue", "fueron", "ha", "habia", "habian",
    "haber", "hay", "hasta", "la", "las", "le", "les", "lo", "los",
    "mas", "me", "mi", "mis", "mucha", "muchas", "mucho", "muchos", "muy",
    "nada", "ni", "no", "nos", "nosotras", "nosotros",
    "nuestra", "nuestras", "nuestro", "nuestros", "o", "otra", "otras",
    "otro", "otros", "para", "pero", "poco", "por", "porque",
    "que", "quien", "quienes", "se", "sea", "sean", "ser", "si", "sido",
    "sin", "sobre", "su", "sus", "tal", "tambien",
    "tampoco", "tan", "tanta", "tantas", "tanto", "te", "tenia", "tenian",
    "tendra", "tendran", "tenemos", "tengo", "ti", "tiene",
    "tienen", "todo", "todos", "tu", "tus", "un", "una", "unas", "uno",
    "unos", "usted", "ustedes", "y", "ya",
}
# Re-normalise every entry (lower-case, accents stripped) so membership tests
# line up with text that went through the same normalisation.
STOPWORDS = {strip_accents(word.lower()) for word in STOPWORDS} | {"aun"}
|
|
|
|
| 42 |
def clean_text(s: str) -> str:
    """Normalise free text into a space-separated string of useful tokens.

    ``None`` becomes ``""`` and any other non-string is converted with
    ``str``.  The text is lower-cased, passed through ``strip_accents``,
    typographic quotes/dashes and remaining punctuation are replaced by
    spaces, whitespace is collapsed, and tokens that are in ``STOPWORDS``
    or consist only of digits are dropped.
    """
    if s is None:
        text = ""
    elif isinstance(s, str):
        text = s
    else:
        text = str(s)

    text = strip_accents(text.lower())
    # Each pass replaces its matches with a single space: quotes and dashes
    # first, then any other non-word character, then whitespace runs.
    for pattern in (r"[“”„‟‹›«»—–‐‒–—―\-]", r"[^\w\s]", r"\s+"):
        text = re.sub(pattern, " ", text)

    kept = (tok for tok in text.strip().split()
            if not (tok in STOPWORDS or tok.isdigit()))
    return " ".join(kept)
|
| 50 |
|
| 51 |
+
def _kw_pattern(kw_norm: str) -> str:
|
| 52 |
+
# "medidor ph" -> r"\bmedidor\b.*\bph\b"
|
| 53 |
+
parts = [re.escape(p) for p in kw_norm.split()]
|
| 54 |
+
if not parts: return ""
|
| 55 |
+
return r"\b" + r".*".join(parts) + r"\b"
|
| 56 |
+
|
| 57 |
def catalog_tag(source_file: str) -> str:
    """Map a source file name to its catalogue tag.

    The name is matched case-insensitively by substring, checking "cicp",
    then "cpc", then "unspsc".

    Args:
        source_file: file name or path.  ``None`` and float NaN (a missing
            pandas cell) are treated as an empty name; other non-strings
            are converted with ``str``.

    Returns:
        ``"CICP"``, ``"CPC"``, ``"UNSPSC"``, or ``"OTRO"`` when nothing matches.
    """
    # NaN is truthy, so `(source_file or "")` would let it through to .lower()
    # and crash; NaN is the only float that compares unequal to itself.
    is_missing = source_file is None or (
        isinstance(source_file, float) and source_file != source_file
    )
    name = "" if is_missing else str(source_file).lower()
    for tag in ("CICP", "CPC", "UNSPSC"):
        if tag.lower() in name:
            return tag
    return "OTRO"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# =========================
|
| 65 |
# Reglas
|
|
|
|
| 633 |
},
|
| 634 |
]
|
| 635 |
|
|
|
|
| 636 |
def aplicar_reglas(query: str):
    """Return the canned answer of the first rule whose keyword matches.

    The query and each keyword are normalised with ``clean_text``; a keyword
    matches when the pattern from ``_kw_pattern`` is found in the normalised
    query.  Rules in ``REGLAS`` and their keywords are tried in order, and
    keywords that normalise to an empty string are skipped.

    Returns:
        ``(DataFrame, message)`` for the first matching rule, where the frame
        has one row per entry of the rule's ``"respuesta"`` mapping with
        similarity 1.0, or ``(None, None)`` when no rule matches.
    """
    normalized_query = clean_text(query)
    for rule in REGLAS:
        for keyword in rule["keywords"]:
            normalized_kw = clean_text(keyword)
            if not normalized_kw:
                continue
            if not re.search(_kw_pattern(normalized_kw), normalized_query):
                continue
            # Each "respuesta" value is indexed as (code, name).
            records = [
                {"Catálogo": catalog, "Código": entry[0], "Nombre": entry[1], "Similaridad": 1.0}
                for catalog, entry in rule["respuesta"].items()
            ]
            return pd.DataFrame(records), f"⚙️ Regla activada: {rule['motivo']}"
    return None, None
|
| 650 |
|
| 651 |
# =========================
|
| 652 |
+
# Parsing de códigos (robusto, mismo que search_tfidf.py)
|
| 653 |
+
# =========================
|
| 654 |
+
ORDER_CATS = ["CICP", "CPC", "UNSPSC"]
|
| 655 |
+
|
| 656 |
+
def _s(x) -> str:
|
| 657 |
+
"""string seguro ('' si None/NaN)"""
|
| 658 |
+
try:
|
| 659 |
+
if x is None: return ""
|
| 660 |
+
if isinstance(x, float) and x != x: # NaN
|
| 661 |
+
return ""
|
| 662 |
+
return str(x)
|
| 663 |
+
except Exception:
|
| 664 |
+
return "" if x is None else str(x)
|
| 665 |
+
|
| 666 |
+
def parse_code_name(catalogo: str, codes_raw, text_original) -> Tuple[str,str]:
    """Extract a ``(code, name)`` pair using the layout of the given catalogue.

    All three inputs are made string-safe with ``_s``.

    * UNSPSC / CPC: ``"<CAT>: <code> ; <name>"`` is searched in ``codes_raw``.
    * CICP: the code comes from ``"CODIGO: <code>"`` in ``codes_raw`` and the
      name from a trailing ``"CICP: <name>"`` in ``text_original``; the pair
      is returned when at least one of them is non-empty.
    * Otherwise, or when the catalogue layout does not match: if ``codes_raw``
      contains ``";"`` the last two of at most three ``;``-separated parts
      are returned; failing that, ``codes_raw`` is the code and
      ``text_original`` (or ``codes_raw`` when that is empty) is the name.
    """
    cat = _s(catalogo).strip().upper()
    raw = _s(codes_raw)
    original = _s(text_original)

    # Catalogues whose code and name sit together in codes_raw.
    paired_layouts = {
        "UNSPSC": r"UNSPSC:\s*([^;]+)\s*;\s*(.+)",
        "CPC": r"CPC:\s*([^;]+)\s*;\s*(.+)",
    }
    if cat in paired_layouts:
        hit = re.search(paired_layouts[cat], raw, flags=re.I)
        if hit:
            return hit.group(1).strip(), hit.group(2).strip()
    elif cat == "CICP":
        code_hit = re.search(r"CODIGO:\s*([^\s\|;]+)", raw, flags=re.I)
        name_hit = re.search(r"CICP:\s*([^|]+)$", original, flags=re.I)
        code = code_hit.group(1).strip() if code_hit else ""
        name = name_hit.group(1).strip() if name_hit else ""
        if code or name:
            return code, name

    # Generic fallback.  With a ";" present the split always yields at least
    # two parts, so the last two can be taken directly.
    if ";" in raw:
        pieces = [piece.strip() for piece in raw.split(";", 2)]
        return pieces[-2], pieces[-1]
    return raw.strip(), (original or raw).strip()
|
| 695 |
+
|
| 696 |
+
def normalize_unspsc_if_cpc_901(rows):
    """Force UNSPSC to "N/A" when the selected CPC code is 901.

    Returns a new list.  If any row is a CPC row whose code, after
    ``str(...).strip()``, equals ``"901"``, every UNSPSC row is replaced by a
    fresh ``N/A`` placeholder with similarity 1.0.  All other rows are the
    same dict objects that were passed in.
    """
    def is_cpc_901(row):
        return row["Catálogo"] == "CPC" and str(row["Código"]).strip() == "901"

    if not any(is_cpc_901(row) for row in rows):
        return list(rows)
    return [
        {"Catálogo": "UNSPSC", "Código": "N/A", "Nombre": "N/A", "Similaridad": 1.0}
        if row["Catálogo"] == "UNSPSC" else row
        for row in rows
    ]
|
| 706 |
+
|
| 707 |
+
def order_and_one_per_catalog(df_like):
    """Return the top-1 row per catalogue, ordered CICP -> CPC -> UNSPSC.

    Args:
        df_like: anything ``pd.DataFrame`` accepts, carrying the columns
            Catálogo, Código, Nombre and Similaridad.  An empty input is
            allowed.

    Returns:
        A DataFrame with exactly the catalogues in ``ORDER_CATS``, in that
        order.  Each row is the highest-``Similaridad`` row of its catalogue
        after ``normalize_unspsc_if_cpc_901``; a catalogue with no candidate
        gets an empty placeholder row with similarity 0.0.
    """
    columns = ["Catálogo", "Código", "Nombre", "Similaridad"]
    df = pd.DataFrame(df_like)

    rows = []
    # A column-less empty frame cannot be sorted or grouped by name; skip
    # straight to the placeholders instead of raising KeyError.
    if not df.empty:
        best = (df.sort_values("Similaridad", ascending=False)
                  .groupby("Catálogo", as_index=False)
                  .head(1))
        rows = [{col: r[col] for col in columns} for _, r in best.iterrows()]
        rows = normalize_unspsc_if_cpc_901(rows)

    # Keep only the expected catalogues: catalog_tag can label a row "OTRO",
    # and ORDER_CATS.index() below raises ValueError for any unknown tag.
    rows = [r for r in rows if r["Catálogo"] in ORDER_CATS]

    have = {r["Catálogo"] for r in rows}
    rows.extend(
        {"Catálogo": cat, "Código": "", "Nombre": "", "Similaridad": 0.0}
        for cat in ORDER_CATS
        if cat not in have
    )
    rows.sort(key=lambda r: ORDER_CATS.index(r["Catálogo"]))
    return pd.DataFrame(rows, columns=columns)
|
| 722 |
+
|
| 723 |
+
# =========================
|
| 724 |
+
# Carga/entrenamiento TF-IDF (como app (2).py)
|
| 725 |
# =========================
|
| 726 |
VECTOR = None
|
| 727 |
MATRIX = None
|
|
|
|
| 729 |
|
| 730 |
def _is_fitted_vectorizer(vec) -> bool:
|
| 731 |
try:
|
| 732 |
+
check_is_fitted(vec, attributes=["vocabulary_"])
|
| 733 |
+
check_is_fitted(vec._tfidf, attributes=["idf_"])
|
| 734 |
return True
|
| 735 |
except Exception:
|
| 736 |
return False
|
| 737 |
|
| 738 |
def _train_and_persist_from_index(index_df: pd.DataFrame):
|
| 739 |
+
corpus = (index_df["tokens_lemmatized"]
|
| 740 |
+
if "tokens_lemmatized" in index_df.columns
|
| 741 |
+
else index_df["text_original"].fillna("").astype(str).map(clean_text))
|
| 742 |
vec = TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b",
|
| 743 |
min_df=1, max_df=0.9, ngram_range=(1,2),
|
| 744 |
sublinear_tf=True, norm="l2")
|
|
|
|
| 756 |
if vec is None or not _is_fitted_vectorizer(vec):
|
| 757 |
vec, X = _train_and_persist_from_index(INDEX)
|
| 758 |
elif X is None:
|
| 759 |
+
corpus = (INDEX["tokens_lemmatized"]
|
| 760 |
+
if "tokens_lemmatized" in INDEX.columns
|
| 761 |
+
else INDEX["text_original"].fillna("").astype(str).map(clean_text))
|
| 762 |
X = vec.transform(list(corpus))
|
| 763 |
sparse.save_npz(MAT_PATH, X)
|
| 764 |
VECTOR, MATRIX = vec, X
|
|
|
|
| 770 |
# 1) Reglas
|
| 771 |
df_regla, motivo = aplicar_reglas(query)
|
| 772 |
if df_regla is not None:
|
| 773 |
+
df_out = order_and_one_per_catalog(df_regla)
|
| 774 |
+
return df_out, motivo
|
| 775 |
|
| 776 |
# 2) Modelo
|
| 777 |
ensure_loaded()
|
|
|
|
| 779 |
if not q:
|
| 780 |
return pd.DataFrame(), "La consulta quedó vacía tras limpieza."
|
| 781 |
|
| 782 |
+
xq = VECTOR.transform([q])
|
| 783 |
+
sims = cosine_similarity(xq, MATRIX).flatten()
|
| 784 |
df = INDEX.copy()
|
| 785 |
df["Similaridad"] = sims
|
| 786 |
df["Catálogo"] = df["source_file"].apply(catalog_tag)
|
|
|
|
|
|
|
| 787 |
|
| 788 |
+
# Evitar NaN antes del parser
|
| 789 |
+
if "codes_raw" in df.columns: df["codes_raw"] = df["codes_raw"].fillna("")
|
| 790 |
+
if "text_original" in df.columns: df["text_original"] = df["text_original"].fillna("")
|
| 791 |
+
|
| 792 |
+
parsed = df.apply(lambda r: parse_code_name(r["Catálogo"], r.get("codes_raw",""), r.get("text_original","")), axis=1)
|
| 793 |
+
df["Código"] = [c for c,_ in parsed]
|
| 794 |
+
df["Nombre"] = [n for _,n in parsed]
|
| 795 |
|
| 796 |
+
df = df[["Catálogo","Código","Nombre","Similaridad"]]
|
| 797 |
+
df_out = order_and_one_per_catalog(df)
|
| 798 |
return df_out, "OK"
|
| 799 |
|
| 800 |
# =========================
|
| 801 |
# Exportar (xlsx con fallback a csv)
|
| 802 |
# =========================
|
| 803 |
+
def exportar(query: str):
|
| 804 |
df, _ = recomendar(query)
|
| 805 |
if df is None or df.empty:
|
| 806 |
df = pd.DataFrame(columns=["Catálogo","Código","Nombre","Similaridad"])
|
|
|
|
|
|
|
| 807 |
try:
|
| 808 |
path = "/tmp/busqueda.xlsx"
|
| 809 |
with pd.ExcelWriter(path, engine="openpyxl") as w:
|
| 810 |
df.to_excel(w, index=False, sheet_name="Resultados")
|
| 811 |
return path, "Archivo Excel (.xlsx) generado."
|
| 812 |
except Exception:
|
|
|
|
| 813 |
try:
|
| 814 |
import xlsxwriter # noqa: F401
|
| 815 |
path = "/tmp/busqueda.xlsx"
|
|
|
|
| 817 |
df.to_excel(w, index=False, sheet_name="Resultados")
|
| 818 |
return path, "Archivo Excel (.xlsx) generado (xlsxwriter)."
|
| 819 |
except Exception:
|
|
|
|
| 820 |
path = "/tmp/busqueda.csv"
|
| 821 |
df.to_csv(path, index=False)
|
| 822 |
return path, "openpyxl/xlsxwriter no disponibles: se generó CSV."
|
| 823 |
|
| 824 |
# =========================
|
| 825 |
+
# UI (Gradio)
|
| 826 |
# =========================
|
| 827 |
with gr.Blocks(title="Recomendador de Códigos (CICP / CPC / UNSPSC)") as demo:
|
| 828 |
+
gr.Markdown("# Recomendador de Códigos (CICP / CPC / UNSPSC)")
|
| 829 |
query = gr.Textbox(
|
| 830 |
label="Descripción técnica",
|
| 831 |
placeholder="reactivos de laboratorio para cromatografía hplc",
|
|
|
|
| 844 |
|
| 845 |
def _on_download(q):
|
| 846 |
path, info = exportar(q)
|
|
|
|
| 847 |
return path, f"**Descarga:** {info}"
|
| 848 |
|
| 849 |
btn.click(_on_search, inputs=[query], outputs=[out, status])
|
| 850 |
+
query.submit(_on_search, inputs=[query], outputs=[out, status])
|
| 851 |
btn_xlsx.click(_on_download, inputs=[query], outputs=[file_out, status])
|
| 852 |
|
| 853 |
if __name__ == "__main__":
|