| import os |
| import random |
| import httpx |
# Compatibility shim: when the installed httpx does not expose
# ``AsyncHTTPTransport``, alias that name to the ASGI transport class so that
# attribute lookups on ``httpx.AsyncHTTPTransport`` still resolve.
if not hasattr(httpx, "AsyncHTTPTransport"):
    from httpx._transports.asgi import ASGITransport

    httpx.AsyncHTTPTransport = ASGITransport


import httpcore

# Compatibility shim: provide ``httpcore.SyncHTTPTransport`` when it is missing
# by aliasing it to ``AsyncConnectionPool``. The guard tests the attribute that
# is actually assigned (it previously tested ``AsyncHTTPTransport``), so an
# existing ``SyncHTTPTransport`` is never overwritten and a missing one is
# always filled in.
# NOTE(review): presumably required so that googletrans imports cleanly on
# newer httpcore releases — confirm against the pinned versions.
if not hasattr(httpcore, "SyncHTTPTransport"):
    httpcore.SyncHTTPTransport = httpcore.AsyncConnectionPool
| from httpx import ReadTimeout |
|
|
| import joblib |
| import re |
| import pandas as pd |
| import textwrap |
|
|
| from sentence_transformers import SentenceTransformer |
| from sklearn.metrics.pairwise import cosine_similarity |
| from googletrans import Translator |
| import plotly.express as px |
|
|
| import gradio as gr |
| from gradio.themes import Soft |
|
|
| |
# Keyword list: one keyword per line; surrounding whitespace is stripped and
# blank lines are ignored.
KEYWORD_FILE = os.path.join("keywords", "large.txt")
with open(KEYWORD_FILE, encoding="utf-8") as f:
    ukraine_keywords = {line.strip() for line in f if line.strip()}

# Regex alternation takes the first alternative that matches, so the keywords
# are ordered longest-first: a short keyword that is a prefix of a longer one
# must not win and leave the tail of the longer word unmasked. The secondary
# alphabetical key makes the pattern identical on every run (plain set
# iteration order changes with string-hash randomisation).
_ordered_keywords = sorted(ukraine_keywords, key=lambda kw: (-len(kw), kw))
if _ordered_keywords:
    _mask_pattern = (
        r"(" + r"|".join(re.escape(kw) for kw in _ordered_keywords) + r")"
    )
else:
    # An empty alternation "()" would match the empty string at every
    # position; use a pattern that can never match instead.
    _mask_pattern = r"((?!))"

# Case-insensitive matcher with a single capturing group around the keyword.
_mask_re = re.compile(_mask_pattern, re.IGNORECASE)
|
|
def mask_kw(text: str) -> str:
    """Return *text* with every keyword match replaced by the literal ``[MASK]``.

    Matching is delegated to the module-level ``_mask_re`` pattern.
    """
    masked = _mask_re.sub("[MASK]", text)
    return masked
|
|
| |
# Cosine-similarity cut-off applied to the centroid similarity, and the base
# URL of the message API queried by the fetch functions.
SIM_THRESHOLD = 0.7
BASE_URL = "https://telegrasc.fly.dev"

# Shared resources, created once at import time.
translator = Translator()
model = SentenceTransformer("sbertmodel/")

# The joblib bundle is indexed by key: the classifier and the centroid vector.
bundle = joblib.load("telegram_classifier.joblib")
war_clf, centroid = bundle["war_clf"], bundle["centroid"]
|
|
def preview_messages(limit: int = 20, start: int = 0):
    """Fetch messages from the API and return one page of their raw text.

    The endpoint is only asked for a count, so the offset is emulated by
    requesting ``limit + start`` messages and slicing the response locally.

    Args:
        limit: Number of messages to show.
        start: Number of leading messages to skip.

    Returns:
        A DataFrame with a single ``"Raw message"`` column.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    # UI sliders may hand over floats; slicing and the query string need
    # non-negative integers.
    limit = max(int(limit), 0)
    start = max(int(start), 0)
    total = limit + start

    resp = httpx.get(f"{BASE_URL}/messages?limit={total}", timeout=10.0)
    resp.raise_for_status()
    msgs = resp.json()

    # Slice first, then extract text; ``or ""`` also covers a JSON null text,
    # which ``dict.get`` with a default would pass through as None.
    page = msgs[start : start + limit]
    texts = [m.get("text") or "" for m in page]
    return pd.DataFrame({"Raw message": texts})
|
|
| |
def _build_similarity_figure(sims, preds, translations):
    """Build the per-message scatter plot of centroid similarity, coloured by prediction."""
    df_plot = pd.DataFrame({
        "index": list(range(len(sims))),
        "similarity": sims,
        "war_pred": preds,
        "message": translations,
    })
    # A real column (rather than a detached Series) lets plotly title the
    # legend "Prediction" instead of the generic "color".
    df_plot["Prediction"] = df_plot["war_pred"].map(
        {True: "Inside", False: "Outside"}
    )
    # Hover text: wrap long messages and join the lines with HTML breaks.
    df_plot["wrapped"] = df_plot["message"].apply(
        lambda m: "<br>".join(textwrap.wrap(m, width=80))
    )

    fig = px.scatter(
        df_plot,
        x="index",
        y="similarity",
        color="Prediction",
        custom_data=["wrapped"],
        labels={"similarity": "Cosine Similarity"},
    )
    fig.update_traces(
        hovertemplate="%{customdata[0]}<extra></extra>",
        hoverlabel=dict(align="left"),
    )
    fig.add_hline(y=SIM_THRESHOLD, line_dash="dash",
                  annotation_text="Threshold")
    fig.update_layout(margin=dict(l=20, r=20, t=30, b=20))
    return fig


def group_and_plot(df: pd.DataFrame):
    """Classify the messages in *df* and split them into war / non-war tables.

    Each message is translated to English for display, keyword-masked and
    embedded with the sentence-transformer model. A message is predicted
    war-related when it contains a keyword, or the classifier probability
    exceeds 0.5, or its cosine similarity to the centroid exceeds
    ``SIM_THRESHOLD``.

    Args:
        df: DataFrame with a ``"Raw message"`` column. Missing or blank cells
            are skipped.

    Returns:
        A tuple ``(df_pos, df_neg, fig)``: translated war-related messages,
        translated other messages, and the similarity scatter plot. When there
        is nothing to classify both tables are empty and ``fig`` is ``None``.
    """
    nothing = (
        pd.DataFrame({"Positive (War-related)": []}),
        pd.DataFrame({"Negative (Not war-related)": []}),
        None,
    )
    if df is None or "Raw message" not in df:
        return nothing

    # Drop null / whitespace-only cells: there is nothing to translate or embed.
    raw_texts = [
        str(cell)
        for cell in df["Raw message"].tolist()
        if not pd.isna(cell) and str(cell).strip()
    ]
    if not raw_texts:
        return nothing

    # Translations are only used for display; classification uses the raw text.
    trans_objs = translator.translate(raw_texts, dest="en")
    translations = [t.text for t in trans_objs]

    # Mask keywords before embedding.
    masked = [mask_kw(t) for t in raw_texts]
    embeddings = model.encode(masked, batch_size=32, convert_to_numpy=True)

    probs = war_clf.predict_proba(embeddings)[:, 1]
    sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
    has_kw = [_mask_re.search(t) is not None for t in raw_texts]
    # bool() keeps the list homogeneous (plain bools, not numpy booleans).
    preds = [
        bool(kw or (p > 0.5) or (s > SIM_THRESHOLD))
        for kw, p, s in zip(has_kw, probs, sims)
    ]

    pos_texts = [t for t, p in zip(translations, preds) if p]
    neg_texts = [t for t, p in zip(translations, preds) if not p]
    df_pos = pd.DataFrame({"Positive (War-related)": pos_texts})
    df_neg = pd.DataFrame({"Negative (Not war-related)": neg_texts})

    fig = _build_similarity_figure(sims, preds, translations)
    return df_pos, df_neg, fig
|
|
| |
def fetch_random_and_classify():
    """Fetch recent messages, pick one at random and classify it.

    Returns:
        A tuple ``(text, translation, label, similarity)``: the raw message,
        its English translation, the prediction label, and the cosine
        similarity to the centroid rounded to three decimals. When the API
        returns no message with text, the first two and the last element are
        empty strings and the label says so.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    try:
        resp = httpx.get(f"{BASE_URL}/messages?limit=100", timeout=15.0)
        resp.raise_for_status()
    except ReadTimeout:
        # Retry once with a smaller page and a longer timeout.
        resp = httpx.get(f"{BASE_URL}/messages?limit=50", timeout=30.0)
        resp.raise_for_status()

    # Only messages that actually carry text can be translated and embedded;
    # this also guards random.choice against an empty list.
    candidates = [
        m["text"]
        for m in resp.json()
        if isinstance(m.get("text"), str) and m["text"].strip()
    ]
    if not candidates:
        return "", "", "No text messages available", ""

    text = random.choice(candidates)
    translation = translator.translate(text, dest="en").text

    emb = model.encode([mask_kw(text)], convert_to_numpy=True)[0]
    emb_row = emb.reshape(1, -1)
    war_prob = float(war_clf.predict_proba(emb_row)[0][1])
    # Reshape the centroid to one row so this works for a flat vector as well
    # as for an already two-dimensional centroid.
    sim_cent = float(cosine_similarity(emb_row, centroid.reshape(1, -1))[0][0])

    # A keyword hit alone is enough; otherwise fall back to the classifier
    # probability and the centroid similarity.
    war_pred = (
        _mask_re.search(text) is not None
        or war_prob > 0.5
        or sim_cent > SIM_THRESHOLD
    )

    label = "Inside (War-related)" if war_pred else "Outside (Not war-related)"
    return text, translation, label, round(sim_cent, 3)
|
|
| |
def build_interface():
    """Assemble the three-tab Gradio Blocks app and return it without launching.

    Tabs:
        * "Bulk Preview" – load a page of raw messages into a table.
        * "Bulk Group & Visualize" – classify the loaded table and plot it.
        * "Random Message" – fetch and classify one random message.
    """
    soft_theme = Soft(
        primary_hue="blue",
        secondary_hue="teal",
        spacing_size="lg",
        radius_size="md",
    )
    # Page-level CSS: centred container, two-column slider row that collapses
    # to a single column on narrow screens.
    page_css = """
    .gradio-container {
        width: 90%;
        max-width: 100%;
        margin: 0 auto;
        padding: 1rem;
    }
    .slider-row {
        display: grid;
        grid-template-columns: 1fr 1fr;
        gap: 2%;
    }
    @media (max-width: 768px) {
        .gradio-container {
            width: 98%;
            padding: 0.5rem;
        }
        .slider-row {
            grid-template-columns: 1fr;
        }
        .gr-button, .gr-slider, .gr-textbox, .gr-dropdown {
            width: 100% !important;
        }
        .gr-dataframe-container {
            overflow-x: auto;
        }
        .gr-plot {
            width: 100% !important;
            height: auto !important;
        }
    }
    """

    with gr.Blocks(css=page_css, theme=soft_theme) as app:
        gr.Markdown("## Telegram Scraper & War/Ukraine Classifier")

        # Tab 1: fetch a page of raw messages.
        with gr.Tab("Bulk Preview"):
            with gr.Row(elem_classes="slider-row"):
                count_slider = gr.Slider(
                    1, 100, value=20, step=1, label="Fetch Count"
                )
                offset_slider = gr.Slider(
                    0, 500, value=0, step=1, label="Offset"
                )
            load_button = gr.Button("Load Messages", variant="primary")
            messages_table = gr.Dataframe(
                type="pandas",
                headers=["Raw message"],
                label="Raw Scraped Messages",
            )
            load_button.click(
                fn=preview_messages,
                inputs=[count_slider, offset_slider],
                outputs=[messages_table],
            )

        # Tab 2: classify whatever is currently in the raw-message table.
        with gr.Tab("Bulk Group & Visualize"):
            classify_button = gr.Button("Group & Plot", variant="primary")
            war_table = gr.Dataframe(
                type="pandas",
                label="Positives (War-related)",
                wrap=True,
            )
            other_table = gr.Dataframe(
                type="pandas",
                label="Negatives (Not war-related)",
                wrap=True,
            )
            scatter_plot = gr.Plot(label="Similarity Scatter")
            classify_button.click(
                fn=group_and_plot,
                inputs=[messages_table],
                outputs=[war_table, other_table, scatter_plot],
            )

        # Tab 3: one random message, shown with its translation and verdict.
        with gr.Tab("Random Message"):
            lucky_button = gr.Button("Fetch Random", variant="primary")
            raw_box = gr.Textbox(label="Raw Message", interactive=False)
            translated_box = gr.Textbox(label="Translated", interactive=False)
            verdict_box = gr.Textbox(label="Prediction", interactive=False)
            similarity_box = gr.Textbox(label="Similarity", interactive=False)
            lucky_button.click(
                fn=fetch_random_and_classify,
                inputs=None,
                outputs=[raw_box, translated_box, verdict_box, similarity_box],
            )

    return app
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    demo = build_interface()
    demo.launch()
|
|