Spaces: Build error
Commit · ccf48e2
1 Parent(s): 75f4618
1.19
Browse files
app.py
CHANGED
|
@@ -322,15 +322,15 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
| 322 |
|
| 323 |
|
| 324 |
def main():
|
| 325 |
-
st.title("кластеризуем новости v.1.
|
| 326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 327 |
|
| 328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
| 329 |
|
| 330 |
if uploaded_file:
|
| 331 |
try:
|
| 332 |
-
# Read all columns from original sheet
|
| 333 |
-
df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 334 |
st.write("Available columns:", df_original.columns.tolist())
|
| 335 |
|
| 336 |
# Create working copy with required columns
|
|
@@ -360,56 +360,50 @@ def main():
|
|
| 360 |
# Step 1: Deduplicate
|
| 361 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
| 362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
| 363 |
-
dedup_df = dedup_df.reset_index(drop=True) # Reset index after deduplication
|
| 364 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
| 365 |
|
| 366 |
-
# Preserve all columns from original DataFrame in dedup_df
|
| 367 |
-
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
| 368 |
-
dedup_df.index = dedup_df_full.index # Ensure indices match
|
| 369 |
|
| 370 |
# Step 2: Cluster deduplicated news
|
| 371 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 372 |
result_df = processor.process_news(dedup_df, progress_bar)
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
if len(result_df) > 0:
|
| 375 |
-
#
|
| 376 |
-
result_df.index = dedup_df_full.index[result_df.index]
|
| 377 |
-
|
| 378 |
-
# Initialize set of indices to delete
|
| 379 |
-
indices_to_delete = set()
|
| 380 |
-
|
| 381 |
-
# Find rows to delete from multi-item clusters
|
| 382 |
multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
|
| 383 |
|
|
|
|
| 384 |
for cluster_id in multi_clusters:
|
| 385 |
-
# Get
|
| 386 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
| 387 |
-
|
| 388 |
|
| 389 |
-
# Get their
|
| 390 |
-
|
| 391 |
|
| 392 |
-
# Find
|
|
|
|
| 393 |
longest_text_idx = text_lengths.idxmax()
|
| 394 |
|
| 395 |
# Add all other indices to delete set
|
| 396 |
-
indices_to_delete.update(set(
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
else:
|
| 403 |
-
declustered_df = dedup_df_full.copy()
|
| 404 |
-
indices_to_delete = set()
|
| 405 |
-
multi_clusters = []
|
| 406 |
|
| 407 |
# Print statistics
|
| 408 |
st.success(f"""
|
| 409 |
Processing results:
|
| 410 |
- Original rows: {len(df_original)}
|
| 411 |
- After deduplication: {len(dedup_df_full)}
|
| 412 |
-
- Multi-item clusters found: {len(multi_clusters)}
|
| 413 |
- Rows removed from clusters: {len(indices_to_delete)}
|
| 414 |
- Final rows kept: {len(declustered_df)}
|
| 415 |
""")
|
|
|
|
| 322 |
|
| 323 |
|
| 324 |
def main():
|
| 325 |
+
st.title("кластеризуем новости v.1.19")
|
| 326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 327 |
|
| 328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
| 329 |
|
| 330 |
if uploaded_file:
|
| 331 |
try:
|
| 332 |
+
# Read all columns from original sheet
|
| 333 |
+
df_original = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 334 |
st.write("Available columns:", df_original.columns.tolist())
|
| 335 |
|
| 336 |
# Create working copy with required columns
|
|
|
|
| 360 |
# Step 1: Deduplicate
|
| 361 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
| 362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
|
|
|
| 363 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
| 364 |
|
| 365 |
+
# Preserve all columns from original DataFrame in dedup_df
|
| 366 |
+
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
|
|
|
| 367 |
|
| 368 |
# Step 2: Cluster deduplicated news
|
| 369 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 370 |
result_df = processor.process_news(dedup_df, progress_bar)
|
| 371 |
|
| 372 |
+
# Initialize set of indices to delete
|
| 373 |
+
indices_to_delete = set()
|
| 374 |
+
|
| 375 |
+
# Find rows to delete from multi-item clusters
|
| 376 |
if len(result_df) > 0:
|
| 377 |
+
# Get all multi-item clusters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
|
| 379 |
|
| 380 |
+
# For each multi-item cluster
|
| 381 |
for cluster_id in multi_clusters:
|
| 382 |
+
# Get rows in this cluster
|
| 383 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
| 384 |
+
cluster_rows = result_df[cluster_mask]
|
| 385 |
|
| 386 |
+
# Get their original indices from dedup_df_full
|
| 387 |
+
original_indices = dedup_df_full.index[cluster_rows.index - 1]
|
| 388 |
|
| 389 |
+
# Find the row with longest text among these indices
|
| 390 |
+
text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
|
| 391 |
longest_text_idx = text_lengths.idxmax()
|
| 392 |
|
| 393 |
# Add all other indices to delete set
|
| 394 |
+
indices_to_delete.update(set(original_indices) - {longest_text_idx})
|
| 395 |
+
|
| 396 |
+
# Create final declustered DataFrame by removing identified rows
|
| 397 |
+
declustered_df = dedup_df_full.copy()
|
| 398 |
+
if indices_to_delete:
|
| 399 |
+
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
# Print statistics
|
| 402 |
st.success(f"""
|
| 403 |
Processing results:
|
| 404 |
- Original rows: {len(df_original)}
|
| 405 |
- After deduplication: {len(dedup_df_full)}
|
| 406 |
+
- Multi-item clusters found: {len(multi_clusters) if len(result_df) > 0 else 0}
|
| 407 |
- Rows removed from clusters: {len(indices_to_delete)}
|
| 408 |
- Final rows kept: {len(declustered_df)}
|
| 409 |
""")
|