pentarosarium committed on
Commit
30c87ad
·
1 Parent(s): 4eab14a
Files changed (1) hide show
  1. app.py +50 -37
app.py CHANGED
@@ -320,8 +320,9 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
320
  b64 = base64.b64encode(excel_buffer.read()).decode()
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
 
323
  def main():
324
- st.title("кластеризуем новости v.1.14")
325
  st.write("Upload Excel file with columns: company, datetime, text")
326
 
327
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
@@ -361,63 +362,75 @@ def main():
361
  dedup_df = deduplicator.deduplicate(df, progress_bar)
362
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
363
 
364
- # Preserve all columns from original DataFrame in dedup_df
365
  dedup_df_full = df_original.loc[dedup_df.index].copy()
366
 
 
 
 
 
367
  # Step 2: Cluster deduplicated news
368
  processor = NewsProcessor(similarity_threshold, time_threshold)
369
- result_df = processor.process_news(dedup_df, progress_bar)
370
 
371
- # Initialize set of indices to delete
372
  indices_to_delete = set()
373
 
374
- # Find rows to delete from multi-item clusters
375
- multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
376
-
377
- for cluster_id in multi_clusters:
378
- # Get all rows in this cluster
379
- cluster_indices = result_df[result_df['cluster_id'] == cluster_id].index
380
-
381
- # Find the row with longest text
382
- text_lengths = dedup_df_full.loc[cluster_indices, text_column].str.len()
383
- longest_text_idx = text_lengths.idxmax()
384
-
385
- # Add all other rows from this cluster to delete list
386
- indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
 
 
 
387
 
388
- # Create final declustered DataFrame by removing identified rows
389
- declustered_df = dedup_df_full.drop(index=list(indices_to_delete))
 
 
390
 
391
- # Print statistics
392
  st.success(f"""
393
  Processing results:
394
  - Original rows: {len(df_original)}
395
  - After deduplication: {len(dedup_df_full)}
396
- - Multi-item clusters found: {len(multi_clusters)}
397
- - Rows removed from clusters: {len(indices_to_delete)}
398
  - Final rows after declustering: {len(declustered_df)}
399
  """)
400
 
 
 
 
 
 
 
 
 
 
 
401
  # Download buttons for all results
402
  st.subheader("Download Results")
403
  st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
404
- st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
405
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
406
 
407
- st.subheader("Cluster Statistics")
408
- cluster_stats = result_df.groupby('cluster_id').agg({
409
- 'cluster_size': 'first',
410
- 'company': 'first',
411
- 'relevance_score': 'mean'
412
- }).rename(columns={'relevance_score': 'avg_relevance'})
413
- st.dataframe(cluster_stats)
414
-
415
- st.subheader("Largest Clusters")
416
- largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
417
- ['cluster_size', 'cluster_id', 'datetime'],
418
- ascending=[False, True, True]
419
- )
420
- st.dataframe(largest_clusters)
421
 
422
  except Exception as e:
423
  st.error(f"Error: {str(e)}")
 
320
  b64 = base64.b64encode(excel_buffer.read()).decode()
321
  return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
322
 
323
+
324
  def main():
325
+ st.title("кластеризуем новости v.1.15")
326
  st.write("Upload Excel file with columns: company, datetime, text")
327
 
328
  uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
 
362
  dedup_df = deduplicator.deduplicate(df, progress_bar)
363
  st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
364
 
365
+ # Get working copy of deduplicated data with all columns
366
  dedup_df_full = df_original.loc[dedup_df.index].copy()
367
 
368
+ # Create working copy for clustering with required columns
369
+ working_df = dedup_df_full[[company_column, datetime_column, title_column, text_column]].copy()
370
+ working_df.columns = ['company', 'datetime', 'title', 'text']
371
+
372
  # Step 2: Cluster deduplicated news
373
  processor = NewsProcessor(similarity_threshold, time_threshold)
374
+ result_df = processor.process_news(working_df, progress_bar)
375
 
376
+ # Create a dictionary to store indices to delete for each cluster
377
  indices_to_delete = set()
378
 
379
+ # Process clusters with more than 1 member
380
+ if not result_df.empty:
381
+ for cluster_id in result_df['cluster_id'].unique():
382
+ cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
383
+
384
+ if len(cluster_rows) > 1: # Only process multi-member clusters
385
+ # Get indices of all rows in this cluster
386
+ cluster_indices = cluster_rows.index
387
+
388
+ # Find the row with the longest text
389
+ text_lengths = working_df.loc[cluster_indices, 'text'].str.len()
390
+ longest_text_idx = text_lengths.idxmax()
391
+
392
+ # Add all other indices from this cluster to deletion set
393
+ cluster_indices_to_delete = set(cluster_indices) - {longest_text_idx}
394
+ indices_to_delete.update(cluster_indices_to_delete)
395
 
396
+ # Create final declustered DataFrame by dropping the identified rows
397
+ declustered_df = dedup_df_full.copy()
398
+ if indices_to_delete:
399
+ declustered_df = declustered_df.drop(index=list(indices_to_delete))
400
 
401
+ # Print statistics for verification
402
  st.success(f"""
403
  Processing results:
404
  - Original rows: {len(df_original)}
405
  - After deduplication: {len(dedup_df_full)}
406
+ - Rows in clusters (any size): {len(result_df) if not result_df.empty else 0}
407
+ - Rows removed from multi-member clusters: {len(indices_to_delete)}
408
  - Final rows after declustering: {len(declustered_df)}
409
  """)
410
 
411
+ # Add debugging information
412
+ if not result_df.empty:
413
+ multi_clusters = len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())
414
+ st.write(f"Number of multi-member clusters found: {multi_clusters}")
415
+
416
+ # Show cluster sizes
417
+ cluster_sizes = result_df['cluster_size'].value_counts().sort_index()
418
+ st.write("Cluster size distribution:")
419
+ st.write(cluster_sizes)
420
+
421
  # Download buttons for all results
422
  st.subheader("Download Results")
423
  st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
 
424
  st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
425
 
426
+ # Show cluster statistics if clusters were found
427
+ if not result_df.empty:
428
+ st.subheader("Largest Clusters")
429
+ largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
430
+ ['cluster_size', 'cluster_id', 'datetime'],
431
+ ascending=[False, True, True]
432
+ )
433
+ st.dataframe(largest_clusters)
 
 
 
 
 
 
434
 
435
  except Exception as e:
436
  st.error(f"Error: {str(e)}")