Spaces:
Build error
Build error
Commit ·
67890fd
1
Parent(s): b3c96bc
1.22 print debug
Browse files
app.py
CHANGED
|
@@ -186,51 +186,30 @@ class NewsProcessor:
|
|
| 186 |
if df.empty:
|
| 187 |
return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
|
| 188 |
|
| 189 |
-
df = df.
|
| 190 |
|
| 191 |
-
# First, filter out news where the company isn't the main subject
|
| 192 |
-
relevance_results = []
|
| 193 |
-
for idx, row in df.iterrows():
|
| 194 |
-
title = row['title'] if 'title' in row else ''
|
| 195 |
-
is_main, score = self.is_company_main_subject(title, row['text'], row['company'])
|
| 196 |
-
if is_main:
|
| 197 |
-
relevance_results.append({
|
| 198 |
-
'idx': idx,
|
| 199 |
-
'relevance_score': score
|
| 200 |
-
})
|
| 201 |
-
|
| 202 |
-
if not relevance_results:
|
| 203 |
-
return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
|
| 204 |
-
|
| 205 |
-
relevant_indices = [r['idx'] for r in relevance_results]
|
| 206 |
-
relevance_scores = {r['idx']: r['relevance_score'] for r in relevance_results}
|
| 207 |
-
|
| 208 |
-
df_filtered = df.loc[relevant_indices].copy()
|
| 209 |
-
df_filtered['relevance_score'] = df_filtered.index.map(relevance_scores)
|
| 210 |
-
|
| 211 |
-
# Continue with clustering logic...
|
| 212 |
clusters = []
|
| 213 |
processed = set()
|
| 214 |
|
| 215 |
-
for
|
| 216 |
-
if
|
| 217 |
continue
|
| 218 |
|
| 219 |
-
row1 =
|
| 220 |
-
cluster = [
|
| 221 |
-
processed.add(
|
| 222 |
|
| 223 |
if not pd.isna(row1['text']):
|
| 224 |
text1_embedding = self.encode_text(row1['text'])
|
| 225 |
|
| 226 |
if progress_bar:
|
| 227 |
-
progress_bar.progress(len(processed) / len(
|
| 228 |
|
| 229 |
-
for
|
| 230 |
-
if
|
| 231 |
continue
|
| 232 |
|
| 233 |
-
row2 =
|
| 234 |
if pd.isna(row2['text']):
|
| 235 |
continue
|
| 236 |
|
|
@@ -242,12 +221,12 @@ class NewsProcessor:
|
|
| 242 |
similarity = np.dot(text1_embedding, text2_embedding)
|
| 243 |
|
| 244 |
if similarity >= self.similarity_threshold:
|
| 245 |
-
cluster.append(
|
| 246 |
-
processed.add(
|
| 247 |
|
| 248 |
clusters.append(cluster)
|
| 249 |
|
| 250 |
-
# Create result DataFrame
|
| 251 |
result_data = []
|
| 252 |
for cluster_id, cluster_indices in enumerate(clusters, 1):
|
| 253 |
cluster_rows = df.loc[cluster_indices]
|
|
@@ -256,12 +235,12 @@ class NewsProcessor:
|
|
| 256 |
'cluster_id': cluster_id,
|
| 257 |
'datetime': df.loc[idx, 'datetime'],
|
| 258 |
'company': df.loc[idx, 'company'],
|
| 259 |
-
'relevance_score': relevance_scores[idx],
|
| 260 |
'text': df.loc[idx, 'text'],
|
| 261 |
'cluster_size': len(cluster_indices)
|
| 262 |
})
|
| 263 |
|
| 264 |
-
|
|
|
|
| 265 |
|
| 266 |
class NewsDeduplicator:
|
| 267 |
def __init__(self, fuzzy_threshold=85):
|
|
@@ -322,7 +301,7 @@ def create_download_link(df: pd.DataFrame, filename: str) -> str:
|
|
| 322 |
|
| 323 |
|
| 324 |
def main():
|
| 325 |
-
st.title("кластеризуем новости v.1.
|
| 326 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 327 |
|
| 328 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
@@ -360,61 +339,54 @@ def main():
|
|
| 360 |
# Step 1: Deduplicate
|
| 361 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
| 362 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
| 363 |
-
st.write("\
|
| 364 |
-
st.write(f"
|
| 365 |
-
st.
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
dedup_df_full = df_original.loc[dedup_df.index].copy()
|
| 369 |
-
st.write(f"dedup_df_full indices: {dedup_df_full.index.tolist()}")
|
| 370 |
|
| 371 |
# Step 2: Cluster deduplicated news
|
| 372 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 373 |
result_df = processor.process_news(dedup_df, progress_bar)
|
| 374 |
-
st.write("\
|
| 375 |
-
st.write(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
-
#
|
| 378 |
indices_to_delete = set()
|
| 379 |
|
| 380 |
-
# Find rows to delete from multi-item clusters
|
| 381 |
if len(result_df) > 0:
|
| 382 |
-
|
| 383 |
-
multi_clusters = result_df[result_df['cluster_size'] > 1]['cluster_id'].unique()
|
| 384 |
-
st.write(f"\nMulti-clusters found: {multi_clusters.tolist()}")
|
| 385 |
-
|
| 386 |
-
# For each multi-item cluster
|
| 387 |
-
for cluster_id in multi_clusters:
|
| 388 |
-
st.write(f"\nProcessing cluster {cluster_id}:")
|
| 389 |
-
# Get rows in this cluster
|
| 390 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
#original_indices = dedup_df_full.index[cluster_rows.index - 1] -it was wrong!
|
| 398 |
-
st.write(f"Original indices: {original_indices.tolist()}")
|
| 399 |
-
|
| 400 |
-
# Find the row with longest text among these indices
|
| 401 |
-
text_lengths = dedup_df_full.loc[original_indices, text_column].fillna('').str.len()
|
| 402 |
-
st.write(f"Text lengths: {text_lengths.to_dict()}")
|
| 403 |
-
longest_text_idx = text_lengths.idxmax()
|
| 404 |
-
st.write(f"Longest text index: {longest_text_idx}")
|
| 405 |
-
|
| 406 |
-
# Add all other indices to delete set
|
| 407 |
-
new_indices_to_delete = set(original_indices) - {longest_text_idx}
|
| 408 |
-
indices_to_delete.update(new_indices_to_delete)
|
| 409 |
-
st.write(f"Indices to delete from this cluster: {new_indices_to_delete}")
|
| 410 |
|
| 411 |
-
st.write(
|
|
|
|
| 412 |
|
| 413 |
-
# Create final
|
| 414 |
-
declustered_df =
|
| 415 |
if indices_to_delete:
|
| 416 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
| 417 |
-
|
|
|
|
| 418 |
|
| 419 |
|
| 420 |
# Print statistics
|
|
|
|
| 186 |
if df.empty:
|
| 187 |
return pd.DataFrame(columns=['cluster_id', 'datetime', 'company', 'relevance_score', 'text', 'cluster_size'])
|
| 188 |
|
| 189 |
+
df = df.copy() # Make a copy to preserve original indices
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
clusters = []
|
| 192 |
processed = set()
|
| 193 |
|
| 194 |
+
for idx in df.index: # Iterate over original indices
|
| 195 |
+
if idx in processed:
|
| 196 |
continue
|
| 197 |
|
| 198 |
+
row1 = df.loc[idx]
|
| 199 |
+
cluster = [idx] # Store original index
|
| 200 |
+
processed.add(idx)
|
| 201 |
|
| 202 |
if not pd.isna(row1['text']):
|
| 203 |
text1_embedding = self.encode_text(row1['text'])
|
| 204 |
|
| 205 |
if progress_bar:
|
| 206 |
+
progress_bar.progress(len(processed) / len(df))
|
| 207 |
|
| 208 |
+
for other_idx in df.index: # Iterate over original indices
|
| 209 |
+
if other_idx in processed:
|
| 210 |
continue
|
| 211 |
|
| 212 |
+
row2 = df.loc[other_idx]
|
| 213 |
if pd.isna(row2['text']):
|
| 214 |
continue
|
| 215 |
|
|
|
|
| 221 |
similarity = np.dot(text1_embedding, text2_embedding)
|
| 222 |
|
| 223 |
if similarity >= self.similarity_threshold:
|
| 224 |
+
cluster.append(other_idx)
|
| 225 |
+
processed.add(other_idx)
|
| 226 |
|
| 227 |
clusters.append(cluster)
|
| 228 |
|
| 229 |
+
# Create result DataFrame preserving original indices
|
| 230 |
result_data = []
|
| 231 |
for cluster_id, cluster_indices in enumerate(clusters, 1):
|
| 232 |
cluster_rows = df.loc[cluster_indices]
|
|
|
|
| 235 |
'cluster_id': cluster_id,
|
| 236 |
'datetime': df.loc[idx, 'datetime'],
|
| 237 |
'company': df.loc[idx, 'company'],
|
|
|
|
| 238 |
'text': df.loc[idx, 'text'],
|
| 239 |
'cluster_size': len(cluster_indices)
|
| 240 |
})
|
| 241 |
|
| 242 |
+
result_df = pd.DataFrame(result_data, index=sum(clusters, [])) # Use original indices
|
| 243 |
+
return result_df
|
| 244 |
|
| 245 |
class NewsDeduplicator:
|
| 246 |
def __init__(self, fuzzy_threshold=85):
|
|
|
|
| 301 |
|
| 302 |
|
| 303 |
def main():
|
| 304 |
+
st.title("кластеризуем новости v.1.22 + print debug")
|
| 305 |
st.write("Upload Excel file with columns: company, datetime, text")
|
| 306 |
|
| 307 |
uploaded_file = st.file_uploader("Choose Excel file", type=['xlsx'])
|
|
|
|
| 339 |
# Step 1: Deduplicate
|
| 340 |
deduplicator = NewsDeduplicator(fuzzy_threshold)
|
| 341 |
dedup_df = deduplicator.deduplicate(df, progress_bar)
|
| 342 |
+
st.write("\nDeduplication Results:")
|
| 343 |
+
st.write(f"Original indices: {df.index.tolist()}")
|
| 344 |
+
st.write(f"Dedup indices: {dedup_df.index.tolist()}")
|
| 345 |
+
st.write(f"Sample from dedup_df:")
|
| 346 |
+
st.write(dedup_df[['company', 'text']].head())
|
|
|
|
|
|
|
| 347 |
|
| 348 |
# Step 2: Cluster deduplicated news
|
| 349 |
processor = NewsProcessor(similarity_threshold, time_threshold)
|
| 350 |
result_df = processor.process_news(dedup_df, progress_bar)
|
| 351 |
+
st.write("\nClustering Results:")
|
| 352 |
+
st.write(f"Result df indices: {result_df.index.tolist()}")
|
| 353 |
+
|
| 354 |
+
# Display cluster information
|
| 355 |
+
if len(result_df) > 0:
|
| 356 |
+
st.write("\nCluster Details:")
|
| 357 |
+
for cluster_id in result_df['cluster_id'].unique():
|
| 358 |
+
cluster_mask = result_df['cluster_id'] == cluster_id
|
| 359 |
+
if sum(cluster_mask) > 1: # Only show multi-item clusters
|
| 360 |
+
cluster_indices = result_df[cluster_mask].index.tolist()
|
| 361 |
+
st.write(f"\nCluster {cluster_id}:")
|
| 362 |
+
st.write(f"Indices: {cluster_indices}")
|
| 363 |
+
# Show texts for verification
|
| 364 |
+
for idx in cluster_indices:
|
| 365 |
+
text_length = len(str(dedup_df.loc[idx, 'text']))
|
| 366 |
+
st.write(f"Index {idx} - Length {text_length}:")
|
| 367 |
+
st.write(str(dedup_df.loc[idx, 'text'])[:100] + '...')
|
| 368 |
|
| 369 |
+
# Process clusters for deletion
|
| 370 |
indices_to_delete = set()
|
| 371 |
|
|
|
|
| 372 |
if len(result_df) > 0:
|
| 373 |
+
for cluster_id in result_df['cluster_id'].unique():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
cluster_mask = result_df['cluster_id'] == cluster_id
|
| 375 |
+
if sum(cluster_mask) > 1:
|
| 376 |
+
cluster_indices = result_df[cluster_mask].index.tolist()
|
| 377 |
+
text_lengths = dedup_df.loc[cluster_indices, 'text'].fillna('').str.len()
|
| 378 |
+
longest_text_idx = text_lengths.idxmax()
|
| 379 |
+
indices_to_delete.update(set(cluster_indices) - {longest_text_idx})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
+
st.write("\nDeletion Summary:")
|
| 382 |
+
st.write(f"Indices to delete: {sorted(list(indices_to_delete))}")
|
| 383 |
|
| 384 |
+
# Create final DataFrame
|
| 385 |
+
declustered_df = dedup_df.copy()
|
| 386 |
if indices_to_delete:
|
| 387 |
declustered_df = declustered_df.drop(index=list(indices_to_delete))
|
| 388 |
+
|
| 389 |
+
st.write(f"Final indices kept: {sorted(declustered_df.index.tolist())}")
|
| 390 |
|
| 391 |
|
| 392 |
# Print statistics
|