Upload 6 files
app.py CHANGED
@@ -202,25 +202,41 @@ def get_csv_columns(file_bytes: bytes) -> List[str]:
     return df.columns.tolist()
 
 
-
-
-
+def processar_pipeline(textos: List[str], small_dataset: bool = False) -> (pd.DataFrame, np.ndarray):
+    """
+    Process texts through the embeddings, UMAP, and HDBSCAN pipeline.
+
+    Args:
+        textos: list of texts
+        small_dataset: if True, use parameters optimized for small datasets (Tavily)
+    """
+    logging.info(f"Iniciando pipeline para {len(textos)} textos (small_dataset={small_dataset})...")
     model = load_retriever()
 
     # 1. Embeddings
     embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
 
-    # 2. UMAP
-
+    # 2. UMAP - adjust n_neighbors for small datasets
+    num_textos = len(textos)
+    n_neighbors = min(15, max(3, num_textos - 1)) if small_dataset else UMAP_N_NEIGHBORS
+
+    reducer = umap.UMAP(n_components=3, n_neighbors=n_neighbors, min_dist=0.0, metric="cosine", random_state=42)
     emb_3d = reducer.fit_transform(embeddings)
     emb_3d = StandardScaler().fit_transform(emb_3d)
 
-    # 3. HDBSCAN
-
-
-
+    # 3. HDBSCAN - adaptive parameters
+    if small_dataset:
+        # For Tavily (10-50 docs): smaller clusters, more aggressive
+        min_size = max(2, int(num_textos * 0.1))  # at least 2, 10% of the dataset
+        min_samples = 1  # allow sparser clusters
+    else:
+        # For large datasets: default behavior
+        min_size = max(10, int(num_textos * 0.02))
+        min_samples = None  # use HDBSCAN's default
+
+    logging.info(f"HDBSCAN: min_cluster_size={min_size}, min_samples={min_samples}")
 
-    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
+    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, min_samples=min_samples)
     clusters = clusterer.fit_predict(emb_3d)
 
     # 4. DataFrame
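Two details in this hunk are easy to miss: UMAP's k-nearest-neighbour graph needs `n_neighbors` to be smaller than the number of samples, so a fixed default would misbehave on a 10-result Tavily batch, and the large-dataset floor of `min_cluster_size=10` would reduce such a batch to a single cluster or pure noise. A minimal sketch of both computations in isolation; the value `UMAP_N_NEIGHBORS = 15` and the sample counts are assumptions for illustration, not taken from the repo:

```python
from typing import Optional, Tuple

UMAP_N_NEIGHBORS = 15  # assumed value of the module-level default


def adaptive_params(num_textos: int, small_dataset: bool) -> Tuple[int, int, Optional[int]]:
    """Return (n_neighbors, min_cluster_size, min_samples) for a corpus of num_textos texts."""
    # UMAP needs n_neighbors < n_samples, so clamp it into [3, 15] for small batches.
    n_neighbors = min(15, max(3, num_textos - 1)) if small_dataset else UMAP_N_NEIGHBORS
    if small_dataset:
        min_size = max(2, int(num_textos * 0.1))    # 10% of the batch, floor of 2
        min_samples = 1                             # sparser clusters allowed
    else:
        min_size = max(10, int(num_textos * 0.02))  # 2% of the corpus, floor of 10
        min_samples = None                          # fall back to HDBSCAN's default
    return n_neighbors, min_size, min_samples


assert adaptive_params(10, True) == (9, 2, 1)          # typical Tavily batch
assert adaptive_params(50, True) == (15, 5, 1)         # upper end of the small range
assert adaptive_params(5000, False) == (15, 100, None)
```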
@@ -591,8 +607,12 @@ async def search_web_api(
     if not textos:
         return {"error": "Resultados sem conteúdo válido.", "results_count": 0}
 
-    # Process through the pipeline
-    df, embeddings = processar_pipeline(textos)
+    # Process through the pipeline with small-dataset parameters
+    df, embeddings = processar_pipeline(textos, small_dataset=True)
+
+    # Attach the source URL to each point
+    df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
+    df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]
 
     # Create the job and cache it
     job_id = str(uuid.uuid4())
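The two comprehensions rely on positional alignment: row `i` of `df` was built from `textos[i]`, which in turn came from `sources[i]`, and the `i < len(sources)` guard only pads rows without a matching source with empty strings. A self-contained illustration with made-up data:

```python
import pandas as pd

# Hypothetical inputs: two Tavily-style sources, three clustered rows.
sources = [
    {"url": "https://example.com/a", "title": "Doc A"},
    {"url": "https://example.com/b", "title": "Doc B"},
]
df = pd.DataFrame({"full_text": ["text a", "text b", "text c"]})

# Same guard as in the diff: rows past len(sources) fall back to "".
df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]

print(df.to_dict("records")[-1])
# {'full_text': 'text c', 'source_url': '', 'source_title': ''}
```

Note that if the pipeline ever deduplicated or reordered `textos`, this positional mapping would attach the wrong source to a point, so keeping the two lists in lockstep is a precondition of the change.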
@@ -617,8 +637,8 @@ async def search_web_api(
             },
             "metrics": metricas_globais,
             "cluster_analysis": analise_por_cluster_tfidf,
-            "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
-            "sources": sources
+            "plot_data": df[["x", "y", "z", "cluster", "full_text", "source_url", "source_title"]].to_dict("records"),
+            "sources": sources
         }
 
     except Exception as e:
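With the two extra columns included, every `plot_data` record now carries its provenance, so the front end can link a 3-D point back to the page it came from. The shape of one record, with illustrative values:

```python
# One plot_data record after the change (all values illustrative).
record = {
    "x": 0.12, "y": -1.30, "z": 0.88,   # standardized 3-D UMAP coordinates
    "cluster": 2,                        # HDBSCAN label; -1 means noise
    "full_text": "text a",
    "source_url": "https://example.com/a",
    "source_title": "Doc A",
}
```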