pentarosarium committed on
Commit
2412746
·
0 Parent(s):

initial commit

Browse files
Files changed (2) hide show
  1. app.py +185 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ from huggingface_hub import HfApi, InferenceClient
6
+ from transformers import pipeline
7
+ from datetime import datetime
8
+ import io
9
+ import base64
10
+ from typing import Dict, List, Set, Tuple
11
+ from rapidfuzz import fuzz, process
12
+ from collections import defaultdict
13
+ from tqdm.auto import tqdm
14
+
15
# Initialize HuggingFace client with token
@st.cache_resource
def get_hf_client():
    """Return a cached InferenceClient authenticated with the Streamlit secret token."""
    return InferenceClient(token=st.secrets["hf_token"])
20
+
21
@st.cache_resource
def get_embeddings_pipeline():
    """Return a cached multilingual feature-extraction pipeline for sentence embeddings."""
    hf_token = st.secrets["hf_token"]
    return pipeline(
        "feature-extraction",
        model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        token=hf_token,
    )
26
+
27
class NewsProcessor:
    """Greedy clustering of news items by embedding similarity, time proximity,
    and company overlap."""

    def __init__(self, similarity_threshold=0.75, time_threshold=24):
        # similarity_threshold: minimum dot-product between mean-pooled
        #   embeddings for two items to be clustered together.
        # time_threshold: maximum gap between items, in hours.
        self.client = get_hf_client()
        self.embeddings_pipeline = get_embeddings_pipeline()
        self.similarity_threshold = similarity_threshold
        self.time_threshold = time_threshold

    def encode_text(self, text):
        """Return a single mean-pooled embedding vector for `text`."""
        embeddings = self.embeddings_pipeline(text)
        # Pipeline output is token-level vectors; average over tokens.
        return np.mean(embeddings[0], axis=0)

    def process_news(self, df: pd.DataFrame, progress_bar=None) -> pd.DataFrame:
        """Cluster the rows of `df` (columns: company, datetime, text).

        Greedy single pass: each unprocessed row seeds a cluster and absorbs
        later rows that are similar enough, within the time window, and share
        at least one company. Returns one row per item with cluster_id and
        cluster_size columns.
        """
        # FIX: pandas str.split treats a multi-char pattern as a regex, and
        # '|' is alternation, so ' | ' split on every single space. Split on
        # the literal ' | ' delimiter instead.
        df['company_list'] = df['company'].str.split(' | ', regex=False)
        df = df.sort_values('datetime')

        clusters = []
        processed = set()
        total_items = len(df)

        # Hoist per-row work out of the O(n^2) comparison loop: the original
        # re-encoded row2 and re-parsed both timestamps on every comparison.
        embeddings = {idx: self.encode_text(row['text']) for idx, row in df.iterrows()}
        timestamps = pd.to_datetime(df['datetime'])

        for i, row1 in df.iterrows():
            if i in processed:
                continue

            cluster = [i]
            processed.add(i)
            text1_embedding = embeddings[i]

            if progress_bar:
                progress_bar.progress(len(processed) / total_items)

            for j, row2 in df.iterrows():
                if j in processed:
                    continue

                time_diff = abs(timestamps[i] - timestamps[j])
                if time_diff.total_seconds() / 3600 > self.time_threshold:
                    continue

                similarity = np.dot(text1_embedding, embeddings[j])

                if similarity >= self.similarity_threshold:
                    companies_overlap = bool(set(row1['company_list']) & set(row2['company_list']))
                    if companies_overlap:
                        cluster.append(j)
                        processed.add(j)

            clusters.append(cluster)

        return self._create_result_df(df, clusters)

    def _create_result_df(self, df: pd.DataFrame, clusters: List[List[int]]) -> pd.DataFrame:
        """Flatten `clusters` into one row per news item, tagged with cluster id/size."""
        result_data = []
        for cluster_id, cluster in enumerate(clusters, 1):
            for idx in cluster:
                # FIX: cluster entries are index *labels* from iterrows(); after
                # dedup + sort the labels are not positional, so .iloc selected
                # the wrong rows (or raised). Use label-based .loc.
                row = df.loc[idx]
                result_data.append({
                    'cluster_id': cluster_id,
                    'datetime': row['datetime'],
                    'company': ' | '.join(row['company_list']),
                    'text': row['text'],
                    'cluster_size': len(cluster),
                })

        return pd.DataFrame(result_data)
92
+
93
class NewsDeduplicator:
    """Drops fuzzy-duplicate news texts, merging the company lists of duplicates."""

    def __init__(self, fuzzy_threshold=85):
        # fuzzy_threshold: rapidfuzz ratio (0-100) at or above which two
        # texts are treated as duplicates.
        self.fuzzy_threshold = fuzzy_threshold

    def deduplicate(self, df: pd.DataFrame, progress_bar=None) -> pd.DataFrame:
        """Return a copy of `df` with fuzzy-duplicate rows removed.

        The first occurrence of each text is kept; the companies of later
        duplicates are merged into its 'company' field (joined by ' | ').
        Result is sorted by 'datetime'.
        """
        seen_texts: List[str] = []
        text_to_companies: Dict[str, Set[str]] = defaultdict(set)
        indices_to_keep: List = []

        total = len(df)
        for position, (idx, row) in enumerate(df.iterrows(), 1):
            text = str(row['text'])
            company = str(row['company'])

            match = None
            if seen_texts:
                result = process.extractOne(
                    text,
                    seen_texts,
                    scorer=fuzz.ratio,
                    score_cutoff=self.fuzzy_threshold
                )
                match = result[0] if result else None

            if match:
                # Duplicate: only record this row's company under the kept text.
                text_to_companies[match].add(company)
            else:
                seen_texts.append(text)
                text_to_companies[text].add(company)
                indices_to_keep.append(idx)

            if progress_bar:
                # FIX: use the row position, not the index label — labels are
                # not guaranteed to be 0..n-1 integers.
                progress_bar.progress(position / total)

        # FIX: iterrows() yields index *labels*; the original mixed positional
        # .iloc with label-based .at, which breaks on any non-RangeIndex input.
        # Use label-based .loc consistently.
        dedup_df = df.loc[indices_to_keep].copy()

        for idx in indices_to_keep:
            text = str(dedup_df.at[idx, 'text'])
            companies = sorted(text_to_companies[text])
            dedup_df.at[idx, 'company'] = ' | '.join(companies)

        return dedup_df.sort_values('datetime')
135
+
136
def create_download_link(df: pd.DataFrame, filename: str) -> str:
    """Return an HTML anchor that downloads `df` as an Excel file named `filename`."""
    excel_buffer = io.BytesIO()
    df.to_excel(excel_buffer, index=False)
    excel_buffer.seek(0)
    b64 = base64.b64encode(excel_buffer.read()).decode()
    # FIX: the `filename` argument was ignored — the download attribute and
    # the link text contained a hard-coded placeholder instead.
    return (
        '<a href="data:application/vnd.openxmlformats-officedocument.'
        f'spreadsheetml.sheet;base64,{b64}" '
        f'download="{filename}">Download {filename}</a>'
    )
142
+
143
def main():
    """Streamlit entry point: upload Excel -> deduplicate -> cluster -> download."""
    st.title("News Clustering App")

    st.write("Upload Excel file with columns: company, datetime, text")

    uploaded_file = st.file_uploader("Choose file", type=['xlsx'])

    if uploaded_file:
        df = pd.read_excel(uploaded_file)
        st.dataframe(df.head())

        col1, col2 = st.columns(2)

        with col1:
            fuzzy_threshold = st.slider("Fuzzy Match Threshold", 30, 100, 50)

        with col2:
            similarity_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.75)
            time_threshold = st.slider("Time Threshold (hours)", 1, 72, 24)

        if st.button("Process"):
            # FIX: create the progress bar *before* the try block — it was
            # created inside try but referenced in finally, so a failure on
            # the very first statement raised NameError in the cleanup.
            progress_bar = st.progress(0)
            try:
                deduplicator = NewsDeduplicator(fuzzy_threshold)
                dedup_df = deduplicator.deduplicate(df, progress_bar)
                st.success(f"Removed {len(df) - len(dedup_df)} duplicates")

                processor = NewsProcessor(similarity_threshold, time_threshold)
                result_df = processor.process_news(dedup_df, progress_bar)
                st.success(f"Found {result_df['cluster_id'].nunique()} clusters")

                st.markdown(create_download_link(result_df, "clustered_news.xlsx"), unsafe_allow_html=True)

                st.dataframe(result_df)

            except Exception as e:
                st.error(f"Error: {str(e)}")
            finally:
                progress_bar.empty()

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ transformers
5
+ rapidfuzz
6
+ huggingface-hub
7
+ openpyxl