Madras1 committed
Commit 8cf9cdd · verified · 1 Parent(s): f23dc10

Upload 6 files

Files changed (1):
  1. app.py +34 -14
app.py CHANGED
@@ -202,25 +202,41 @@ def get_csv_columns(file_bytes: bytes) -> List[str]:
     return df.columns.tolist()
 
 
-
-def processar_pipeline(textos: List[str]) -> (pd.DataFrame, np.ndarray):
-    logging.info(f"Starting pipeline for {len(textos)} texts...")
+def processar_pipeline(textos: List[str], small_dataset: bool = False) -> (pd.DataFrame, np.ndarray):
+    """
+    Processes texts through the embeddings, UMAP, and HDBSCAN pipeline.
+
+    Args:
+        textos: List of texts
+        small_dataset: If True, uses parameters optimized for small datasets (Tavily)
+    """
+    logging.info(f"Starting pipeline for {len(textos)} texts (small_dataset={small_dataset})...")
     model = load_retriever()
 
     # 1. Embeddings
     embeddings = model.encode(textos, batch_size=BATCH_SIZE, show_progress_bar=False, convert_to_numpy=True)
 
-    # 2. UMAP
-    reducer = umap.UMAP(n_components=3, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, metric="cosine", random_state=42)
+    # 2. UMAP - adjust n_neighbors for small datasets
+    num_textos = len(textos)
+    n_neighbors = min(15, max(3, num_textos - 1)) if small_dataset else UMAP_N_NEIGHBORS
+
+    reducer = umap.UMAP(n_components=3, n_neighbors=n_neighbors, min_dist=0.0, metric="cosine", random_state=42)
     emb_3d = reducer.fit_transform(embeddings)
     emb_3d = StandardScaler().fit_transform(emb_3d)
 
-    # 3. HDBSCAN
-    num_textos = len(textos)
-    min_size = max(10, int(num_textos * 0.02))
-    logging.info(f"HDBSCAN min_cluster_size: {min_size}")
+    # 3. HDBSCAN - adaptive parameters
+    if small_dataset:
+        # For Tavily (10-50 docs): smaller clusters, more aggressive
+        min_size = max(2, int(num_textos * 0.1))    # at least 2; 10% of the dataset
+        min_samples = 1                             # allows sparser clusters
+    else:
+        # For large datasets: default behavior
+        min_size = max(10, int(num_textos * 0.02))
+        min_samples = None                          # uses the HDBSCAN default
+
+    logging.info(f"HDBSCAN: min_cluster_size={min_size}, min_samples={min_samples}")
 
-    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size)
+    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, min_samples=min_samples)
     clusters = clusterer.fit_predict(emb_3d)
 
     # 4. DataFrame
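
Note on the adaptive parameters in this hunk: a minimal standalone sketch of the two formulas, showing what they evaluate to at typical sizes. UMAP_N_NEIGHBORS = 15 is an assumption for the sketch; the real constant is defined elsewhere in app.py.

UMAP_N_NEIGHBORS = 15  # assumed value; app.py defines the real constant

def adaptive_params(num_textos: int, small_dataset: bool):
    # n_neighbors: capped at 15, floored at 3, never more than num_textos - 1
    n_neighbors = min(15, max(3, num_textos - 1)) if small_dataset else UMAP_N_NEIGHBORS
    if small_dataset:
        min_size = max(2, int(num_textos * 0.1))    # at least 2; 10% of the dataset
        min_samples = 1                             # sparser clusters allowed
    else:
        min_size = max(10, int(num_textos * 0.02))  # at least 10; 2% of the dataset
        min_samples = None                          # HDBSCAN default
    return n_neighbors, min_size, min_samples

for n in (10, 30, 50):
    print(n, adaptive_params(n, small_dataset=True))     # (9, 2, 1), (15, 3, 1), (15, 5, 1)
print(5000, adaptive_params(5000, small_dataset=False))  # (15, 100, None)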
@@ -591,8 +607,12 @@ async def search_web_api(
        if not textos:
            return {"error": "Results without valid content.", "results_count": 0}
 
-       # Process through the existing pipeline
-       df, embeddings = processar_pipeline(textos)
+       # Process through the pipeline with small-dataset parameters
+       df, embeddings = processar_pipeline(textos, small_dataset=True)
+
+       # Attach the source URL to each point
+       df["source_url"] = [sources[i]["url"] if i < len(sources) else "" for i in range(len(df))]
+       df["source_title"] = [sources[i]["title"] if i < len(sources) else "" for i in range(len(df))]
 
        # Create the job and cache it
        job_id = str(uuid.uuid4())
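
A note on the source columns added above: the comprehensions assume row i of df still lines up with textos[i], and therefore with sources[i]; when sources is shorter than the frame, the trailing rows get empty strings instead of raising IndexError. A toy illustration of that guard (data invented for the example):

sources = [{"url": "https://a.example", "title": "A"}]  # shorter than the frame
n_rows = 3  # pretend df has 3 rows
urls = [sources[i]["url"] if i < len(sources) else "" for i in range(n_rows)]
print(urls)  # ['https://a.example', '', '']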
@@ -617,8 +637,8 @@ async def search_web_api(
            },
            "metrics": metricas_globais,
            "cluster_analysis": analise_por_cluster_tfidf,
-           "plot_data": df[["x", "y", "z", "cluster", "full_text"]].to_dict("records"),
-           "sources": sources  # original URLs
+           "plot_data": df[["x", "y", "z", "cluster", "full_text", "source_url", "source_title"]].to_dict("records"),
+           "sources": sources
        }
 
    except Exception as e:
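
With the two extra columns in plot_data, every plotted point now carries its provenance. An illustrative record shape (values invented for the example; keys match the column list in the hunk above):

record = {
    "x": 0.12, "y": -1.40, "z": 0.88,  # standardized 3-D UMAP coordinates
    "cluster": 2,                      # HDBSCAN label; -1 marks noise
    "full_text": "...",
    "source_url": "https://a.example",
    "source_title": "A",
}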
 