"""Gradio app that fetches scraped messages, translates them and labels them.

Messages are fetched over HTTP from ``BASE_URL``, translated to English and
labelled as war-related ("Inside") or not ("Outside") using a keyword match,
a classifier probability and a cosine similarity against a stored centroid.
"""

import os
import random
import re
import textwrap

import httpx

# Compatibility shims. They must run before the imports below.
# NOTE(review): presumably needed because googletrans expects these attribute
# names on older/newer httpx/httpcore releases -- confirm against the pinned
# versions.
if not hasattr(httpx, "AsyncHTTPTransport"):
    from httpx._transports.asgi import ASGITransport

    httpx.AsyncHTTPTransport = ASGITransport

import httpcore

# Guard on the attribute that is actually assigned, so an existing
# SyncHTTPTransport is never overwritten.
if not hasattr(httpcore, "SyncHTTPTransport"):
    httpcore.SyncHTTPTransport = getattr(httpcore, "AsyncConnectionPool")

from httpx import ReadTimeout
import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from googletrans import Translator
import plotly.express as px
import gradio as gr
from gradio.themes import Soft

# --- Load Keywords Dynamically ---
KEYWORD_FILE = os.path.join("keywords", "large.txt")
with open(KEYWORD_FILE, encoding="utf-8") as f:
    # One keyword per line; blank lines are ignored.
    ukraine_keywords = {kw for kw in (line.strip() for line in f) if kw}


def _build_mask_re(keywords):
    """Compile a case-insensitive pattern matching any of *keywords*.

    An empty keyword collection yields a pattern that never matches; a bare
    empty alternation would match at every position and flag every message.
    Keywords are tried longest-first so that a short keyword cannot shadow a
    longer one that starts with it, and so the result does not depend on set
    iteration order.
    """
    if not keywords:
        return re.compile(r"(?!)")
    ordered = sorted(keywords, key=lambda w: (-len(w), w))
    return re.compile(
        r"(" + r"|".join(re.escape(w) for w in ordered) + r")",
        re.IGNORECASE,
    )


_mask_re = _build_mask_re(ukraine_keywords)


def mask_kw(text: str) -> str:
    """Return *text* with every keyword occurrence replaced by ``[MASK]``."""
    return _mask_re.sub("[MASK]", text)


# --- Model & Classifier Setup (load once) ---
model = SentenceTransformer("sbertmodel/")
bundle = joblib.load("telegram_classifier.joblib")
war_clf = bundle["war_clf"]
centroid = bundle["centroid"]
SIM_THRESHOLD = 0.7
translator = Translator()

# --- Fetch / Preview ---
BASE_URL = "https://telegrasc.fly.dev"


def _fetch_messages(limit: int, timeout: float):
    """GET ``{BASE_URL}/messages`` with the given limit and return the JSON.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        httpx.TimeoutException: (e.g. ``ReadTimeout``) if the request times out.
    """
    resp = httpx.get(
        f"{BASE_URL}/messages", params={"limit": limit}, timeout=timeout
    )
    resp.raise_for_status()
    return resp.json()


def _message_text(msg) -> str:
    """Return the ``"text"`` field of a message dict, or ``""`` if absent/null."""
    return msg.get("text") or ""


def _classify(texts):
    """Embed *texts* (keywords masked) and classify each one.

    A text is predicted war-related when it contains a keyword, or the
    classifier probability exceeds 0.5, or its cosine similarity to the
    centroid exceeds ``SIM_THRESHOLD``.

    Returns:
        ``(sims, preds)``: the per-text similarities and a list of bools.
    """
    embeddings = model.encode(
        [mask_kw(t) for t in texts], batch_size=32, convert_to_numpy=True
    )
    probs = war_clf.predict_proba(embeddings)[:, 1]
    sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
    preds = [
        _mask_re.search(t) is not None
        or bool(p > 0.5)
        or bool(s > SIM_THRESHOLD)
        for t, p, s in zip(texts, probs, sims)
    ]
    return sims, preds


def preview_messages(limit: int = 20, start: int = 0):
    """Fetch messages and return a window of them as a one-column DataFrame.

    Args:
        limit: Number of messages to show.
        start: Offset of the first message to show.

    Returns:
        DataFrame with a single ``"Raw message"`` column.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
    """
    # UI sliders may deliver floats; slicing needs non-negative ints.
    limit = max(int(limit), 0)
    start = max(int(start), 0)
    # The endpoint is only given a limit, so fetch limit + start and slice.
    msgs = _fetch_messages(limit + start, timeout=10.0)
    texts = [_message_text(m) for m in msgs[start : start + limit]]
    return pd.DataFrame({"Raw message": texts})


# --- Group & Plot (batched) ---
def group_and_plot(df: pd.DataFrame):
    """Translate, classify and plot the messages in *df*.

    Args:
        df: DataFrame with a ``"Raw message"`` column.

    Returns:
        ``(df_pos, df_neg, fig)``: translated war-related messages, translated
        other messages, and a scatter plot of similarity per message. When
        *df* holds no usable text the tables are empty and ``fig`` is ``None``.
    """
    if df is None or "Raw message" not in df.columns:
        raw_texts = []
    else:
        # Drop missing/blank cells: they cannot be translated or embedded.
        raw_texts = [
            s
            for s in (str(v) for v in df["Raw message"].dropna())
            if s.strip()
        ]

    if not raw_texts:
        return (
            pd.DataFrame({"Positive (War-related)": []}),
            pd.DataFrame({"Negative (Not war-related)": []}),
            None,
        )

    # 1) Batch translate
    trans_objs = translator.translate(raw_texts, dest="en")
    translations = [t.text for t in trans_objs]

    # 2) Mask & embed in batch
    # 3) Predict probabilities and similarities in bulk
    sims, preds = _classify(raw_texts)

    # 4) Split positives/negatives
    pos_texts = [t for t, p in zip(translations, preds) if p]
    neg_texts = [t for t, p in zip(translations, preds) if not p]
    df_pos = pd.DataFrame({"Positive (War-related)": pos_texts})
    df_neg = pd.DataFrame({"Negative (Not war-related)": neg_texts})

    # 5) Prepare plot DataFrame
    df_plot = pd.DataFrame({
        "index": list(range(len(sims))),
        "similarity": sims,
        "war_pred": preds,
        "message": translations,
    })
    # Plotly hover labels break lines on <br>, not on newline characters.
    df_plot["wrapped"] = df_plot["message"].apply(
        lambda m: "<br>".join(textwrap.wrap(m, width=80))
    )

    fig = px.scatter(
        df_plot,
        x="index",
        y="similarity",
        color=df_plot["war_pred"].map({True: "Inside", False: "Outside"}),
        custom_data=["wrapped"],
        labels={"war_pred": "Prediction", "similarity": "Cosine Similarity"},
    )
    fig.update_traces(
        hovertemplate="%{customdata[0]}",
        hoverlabel=dict(align="left"),
    )
    fig.add_hline(y=SIM_THRESHOLD, line_dash="dash", annotation_text="Threshold")
    fig.update_layout(margin=dict(l=20, r=20, t=30, b=20))

    return df_pos, df_neg, fig


# --- Random Single Message (smaller fetch) ---
def fetch_random_and_classify():
    """Fetch messages, pick one at random, translate and classify it.

    Returns:
        ``(text, translation, label, similarity)`` where ``similarity`` is the
        cosine similarity to the centroid rounded to three decimals.

    Raises:
        gr.Error: if no fetched message contains any text.
        httpx.HTTPStatusError: if the server answers with an error status.
    """
    try:
        msgs = _fetch_messages(100, timeout=15.0)
    except ReadTimeout:
        # Retry once with a smaller request and a longer timeout.
        msgs = _fetch_messages(50, timeout=30.0)

    # Only messages with text can be translated and classified.
    candidates = [t for t in (_message_text(m) for m in msgs) if t.strip()]
    if not candidates:
        raise gr.Error("No messages with text are available to classify.")

    text = random.choice(candidates)
    translation = translator.translate(text, dest="en").text

    sims, preds = _classify([text])
    sim_cent = float(sims[0])
    label = "Inside (War-related)" if preds[0] else "Outside (Not war-related)"
    return text, translation, label, round(sim_cent, 3)


# --- Build Responsive, Themed Interface ---
def build_interface():
    """Build and return the three-tab Gradio ``Blocks`` application."""
    # Adjacent literals keep the stylesheet text exactly as before while
    # staying readable.
    responsive_css = (
        " .gradio-container { width: 90%; max-width: 100%; margin: 0 auto; padding: 1rem; }"
        " .slider-row { display: grid; grid-template-columns: 1fr 1fr; gap: 2%; }"
        " @media (max-width: 768px) {"
        " .gradio-container { width: 98%; padding: 0.5rem; }"
        " .slider-row { grid-template-columns: 1fr; }"
        " .gr-button, .gr-slider, .gr-textbox, .gr-dropdown { width: 100% !important; }"
        " .gr-dataframe-container { overflow-x: auto; }"
        " .gr-plot { width: 100% !important; height: auto !important; }"
        " } "
    )

    theme = Soft(
        primary_hue="blue",
        secondary_hue="teal",
        spacing_size="lg",
        radius_size="md",
    )

    with gr.Blocks(css=responsive_css, theme=theme) as demo:
        gr.Markdown("## Telegram Scraper & War/Ukraine Classifier")

        with gr.Tab("Bulk Preview"):
            with gr.Row(elem_classes="slider-row"):
                limit_slider = gr.Slider(1, 100, value=20, step=1, label="Fetch Count")
                start_slider = gr.Slider(0, 500, value=0, step=1, label="Offset")
            fetch_btn = gr.Button("Load Messages", variant="primary")
            raw_table = gr.Dataframe(
                type="pandas",
                headers=["Raw message"],
                label="Raw Scraped Messages",
            )
            fetch_btn.click(
                fn=preview_messages,
                inputs=[limit_slider, start_slider],
                outputs=[raw_table],
            )

        with gr.Tab("Bulk Group & Visualize"):
            group_btn = gr.Button("Group & Plot", variant="primary")
            pos_table = gr.Dataframe(
                type="pandas",
                label="Positives (War-related)",
                wrap=True,
            )
            neg_table = gr.Dataframe(
                type="pandas",
                label="Negatives (Not war-related)",
                wrap=True,
            )
            plot_output = gr.Plot(label="Similarity Scatter")
            # Classifies whatever is currently in the preview tab's table.
            group_btn.click(
                fn=group_and_plot,
                inputs=[raw_table],
                outputs=[pos_table, neg_table, plot_output],
            )

        with gr.Tab("Random Message"):
            random_btn = gr.Button("Fetch Random", variant="primary")
            raw_out = gr.Textbox(label="Raw Message", interactive=False)
            trans_out = gr.Textbox(label="Translated", interactive=False)
            pred_out = gr.Textbox(label="Prediction", interactive=False)
            sim_out = gr.Textbox(label="Similarity", interactive=False)
            random_btn.click(
                fn=fetch_random_and_classify,
                inputs=None,
                outputs=[raw_out, trans_out, pred_out, sim_out],
            )

    return demo


if __name__ == "__main__":
    build_interface().launch()