Spaces:
Build error
Build error
Commit ·
30c87ad
1
Parent(s): 4eab14a
1.15
Browse files
app.py
CHANGED
|
@@ -320,8 +320,9 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
| 320 |
b64 = base64.b64encode(excel_buffer.read()).decode()
|
| 321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
| 322 |
|
|
|
|
| 323 |
def main():
|
| 324 |
-
st.title("кластеризуем новости v.1.
|
| 325 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 326 |
|
| 327 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
@@ -361,63 +362,75 @@ def main():
|
|
| 361 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
| 362 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
| 363 |
|
| 364 |
-
#
|
| 365 |
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
# Step 2: Cluster deduplicated news
|
| 368 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 369 |
-
result_df = processor.process_news(
|
| 370 |
|
| 371 |
-
#
|
| 372 |
indices_to_delete = set()
|
| 373 |
|
| 374 |
-
#
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
-
# Create final declustered DataFrame by
|
| 389 |
-
declustered_df = dedup_df_full.
|
|
|
|
|
|
|
| 390 |
|
| 391 |
-
# Print statistics
|
| 392 |
st.success(f"""
|
| 393 |
Processing results:
|
| 394 |
- Original rows: {len(df_original)}
|
| 395 |
- After deduplication: {len(dedup_df_full)}
|
| 396 |
-
-
|
| 397 |
-
- Rows removed from clusters: {len(indices_to_delete)}
|
| 398 |
- Final rows after declustering: {len(declustered_df)}
|
| 399 |
""")
|
| 400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
# Download buttons for all results
|
| 402 |
st.subheader("Download Results")
|
| 403 |
st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
|
| 404 |
-
st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)
|
| 405 |
st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
st.subheader("Largest Clusters")
|
| 416 |
-
largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
|
| 417 |
-
['cluster_size', 'cluster_id', 'datetime'],
|
| 418 |
-
ascending=[False, True, True]
|
| 419 |
-
)
|
| 420 |
-
st.dataframe(largest_clusters)
|
| 421 |
|
| 422 |
except Exception as e:
|
| 423 |
st.error(f"Error: {str(e)}")
|
|
|
|
| 320 |
b64 = base64.b64encode(excel_buffer.read()).decode()
|
| 321 |
return f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}">Download {filename}</a>'
|
| 322 |
|
| 323 |
+
|
| 324 |
def main():
|
| 325 |
+
st.title("кластеризуем новости v.1.15")
|
| 326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 327 |
|
| 328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
|
| 362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
| 363 |
st.success(f"Removed {len(df) - len(dedup_df)} duplicates")
|
| 364 |
|
| 365 |
+
# Get working copy of deduplicated data with all columns
|
| 366 |
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
| 367 |
|
| 368 |
+
# Create working copy for clustering with required columns
|
| 369 |
+
working_df = dedup_df_full[[company_column, datetime_column, title_column, text_column]].copy()
|
| 370 |
+
working_df.columns = ['company', 'datetime', 'title', 'text']
|
| 371 |
+
|
| 372 |
# Step 2: Cluster deduplicated news
|
| 373 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 374 |
+
result_df = processor.process_news(working_df, progress_bar)
|
| 375 |
|
| 376 |
+
# Create a set to collect the indices of rows to delete across all clusters
|
| 377 |
indices_to_delete = set()
|
| 378 |
|
| 379 |
+
# Process clusters with more than 1 member
|
| 380 |
+
if not result_df.empty:
|
| 381 |
+
for cluster_id in result_df['cluster_id'].unique():
|
| 382 |
+
cluster_rows = result_df[result_df['cluster_id'] == cluster_id]
|
| 383 |
+
|
| 384 |
+
if len(cluster_rows) > 1: # Only process multi-member clusters
|
| 385 |
+
# Get indices of all rows in this cluster
|
| 386 |
+
cluster_indices = cluster_rows.index
|
| 387 |
+
|
| 388 |
+
# Find the row with the longest text
|
| 389 |
+
text_lengths = working_df.loc[cluster_indices, 'text'].str.len()
|
| 390 |
+
longest_text_idx = text_lengths.idxmax()
|
| 391 |
+
|
| 392 |
+
# Add all other indices from this cluster to deletion set
|
| 393 |
+
cluster_indices_to_delete = set(cluster_indices) - {longest_text_idx}
|
| 394 |
+
indices_to_delete.update(cluster_indices_to_delete)
|
| 395 |
|
| 396 |
+
# Create final declustered DataFrame by dropping the identified rows
|
| 397 |
+
declustered_df = dedup_df_full.copy()
|
| 398 |
+
if indices_to_delete:
|
| 399 |
+
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
| 400 |
|
| 401 |
+
# Print statistics for verification
|
| 402 |
st.success(f"""
|
| 403 |
Processing results:
|
| 404 |
- Original rows: {len(df_original)}
|
| 405 |
- After deduplication: {len(dedup_df_full)}
|
| 406 |
+
- Rows in clusters (any size): {len(result_df) if not result_df.empty else 0}
|
| 407 |
+
- Rows removed from multi-member clusters: {len(indices_to_delete)}
|
| 408 |
- Final rows after declustering: {len(declustered_df)}
|
| 409 |
""")
|
| 410 |
|
| 411 |
+
# Add debugging information
|
| 412 |
+
if not result_df.empty:
|
| 413 |
+
multi_clusters = len(result_df[result_df['cluster_size'] > 1]['cluster_id'].unique())
|
| 414 |
+
st.write(f"Number of multi-member clusters found: {multi_clusters}")
|
| 415 |
+
|
| 416 |
+
# Show cluster sizes
|
| 417 |
+
cluster_sizes = result_df['cluster_size'].value_counts().sort_index()
|
| 418 |
+
st.write("Cluster size distribution:")
|
| 419 |
+
st.write(cluster_sizes)
|
| 420 |
+
|
| 421 |
# Download buttons for all results
|
| 422 |
st.subheader("Download Results")
|
| 423 |
st.markdown(create_download_link(dedup_df_full, "deduplicated_news.xlsx"), unsafe_allow_html=True)
|
|
|
|
| 424 |
st.markdown(create_download_link(declustered_df, "declustered_news.xlsx"), unsafe_allow_html=True)
|
| 425 |
|
| 426 |
+
# Show cluster statistics if clusters were found
|
| 427 |
+
if not result_df.empty:
|
| 428 |
+
st.subheader("Largest Clusters")
|
| 429 |
+
largest_clusters = result_df[result_df['cluster_size'] > 1].sort_values(
|
| 430 |
+
['cluster_size', 'cluster_id', 'datetime'],
|
| 431 |
+
ascending=[False, True, True]
|
| 432 |
+
)
|
| 433 |
+
st.dataframe(largest_clusters)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
except Exception as e:
|
| 436 |
st.error(f"Error: {str(e)}")
|