# telegrukr / app.py
# Norvoke's picture
# Upload folder using huggingface_hub
# 426ba5d verified
import os
import random
import httpx
# Compatibility shims: alias transport names that are missing from the
# installed httpx / httpcore releases.
# NOTE(review): presumably these aliases exist for googletrans, which targets
# older httpx/httpcore APIs — confirm.
if not hasattr(httpx, "AsyncHTTPTransport"):
    from httpx._transports.asgi import ASGITransport

    httpx.AsyncHTTPTransport = ASGITransport
import httpcore

# Guard on the attribute that is actually patched in, so an existing
# SyncHTTPTransport is never overwritten and a missing one is always supplied.
if not hasattr(httpcore, "SyncHTTPTransport"):
    httpcore.SyncHTTPTransport = httpcore.AsyncConnectionPool
from httpx import ReadTimeout
import joblib
import re
import pandas as pd
import textwrap
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from googletrans import Translator
import plotly.express as px
import gradio as gr
from gradio.themes import Soft
# --- Load Keywords Dynamically ---
KEYWORD_FILE = os.path.join("keywords", "large.txt")
with open(KEYWORD_FILE, encoding="utf-8") as f:
    # One keyword per line; surrounding whitespace and blank lines are dropped.
    _stripped_lines = (line.strip() for line in f)
    ukraine_keywords = {word for word in _stripped_lines if word}

# A regex alternation takes the first alternative that matches, and set
# iteration order is not stable between runs.  Ordering the keywords
# longest-first (ties broken alphabetically) makes masking deterministic and
# ensures a longer keyword wins over any shorter keyword that is its prefix.
_keyword_alternatives = sorted(ukraine_keywords, key=lambda w: (-len(w), w))
_mask_re = re.compile(
    # With no keywords the joined pattern would be "()", which matches the
    # empty string at every position; "(?!)" never matches anything.
    r"(" + r"|".join(re.escape(w) for w in _keyword_alternatives) + r")"
    if _keyword_alternatives
    else r"(?!)",
    re.IGNORECASE,
)
def mask_kw(text: str) -> str:
    """Return *text* with every match of the keyword pattern replaced by ``[MASK]``."""
    masked = re.sub(_mask_re, "[MASK]", text)
    return masked
# --- Model & Classifier Setup (load once) ---
# These objects are created at import time and reused by group_and_plot and
# fetch_random_and_classify.
model = SentenceTransformer("sbertmodel/")
bundle = joblib.load("telegram_classifier.joblib")
# The bundle is indexed by name: the classifier and the reference centroid.
war_clf, centroid = bundle["war_clf"], bundle["centroid"]
# Cosine similarity to the centroid above which a message counts as a hit.
SIM_THRESHOLD = 0.7
translator = Translator()
# --- Fetch / Preview ---
# Root URL of the HTTP service the app reads messages from; the fetch
# functions in this file request "<BASE_URL>/messages?limit=N".
BASE_URL = "https://telegrasc.fly.dev"
def preview_messages(limit: int = 20, start: int = 0):
    """Fetch a window of messages from the API for display.

    The request only passes a ``limit``, so ``limit + start`` messages are
    requested and the first ``start`` of them are dropped locally.

    Args:
        limit: Number of messages to return.
        start: Number of leading messages to skip.

    Returns:
        A DataFrame with a single "Raw message" column.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    # Callers such as UI sliders may pass floats (20.0); both the query string
    # and the slice below need non-negative integers.
    limit = max(int(limit), 0)
    start = max(int(start), 0)
    total = limit + start
    resp = httpx.get(f"{BASE_URL}/messages?limit={total}", timeout=10.0)
    resp.raise_for_status()
    window = resp.json()[start : start + limit]
    # A missing *or null* "text" becomes an empty string instead of None.
    texts = [m.get("text") or "" for m in window]
    return pd.DataFrame({"Raw message": texts})
# --- Group & Plot (batched) ---
def group_and_plot(df: pd.DataFrame):
    """Classify every message in *df* and plot its similarity to the centroid.

    A message is treated as war-related when it contains a keyword, when the
    classifier probability exceeds 0.5, or when its cosine similarity to the
    centroid exceeds ``SIM_THRESHOLD``.

    Args:
        df: Table with a "Raw message" column. Cells that are missing or
            contain only whitespace are skipped.

    Returns:
        A tuple ``(df_pos, df_neg, fig)``: the translated war-related
        messages, the translated remaining messages, and a plotly scatter of
        similarity per message. When there is nothing to classify both tables
        are empty and ``fig`` is ``None``.
    """
    # Normalise the input: drop None/NaN and blank cells, coerce the rest to
    # str so the regex, translator and encoder always receive real text.
    if df is None or df.empty:
        raw_texts = []
    else:
        raw_texts = [
            str(cell)
            for cell in df["Raw message"].tolist()
            if not pd.isna(cell) and str(cell).strip()
        ]

    # Nothing to classify: return empty tables instead of failing inside the
    # translator / encoder / cosine_similarity on zero rows.
    if not raw_texts:
        return (
            pd.DataFrame({"Positive (War-related)": []}),
            pd.DataFrame({"Negative (Not war-related)": []}),
            None,
        )

    # 1) Batch translate
    trans_objs = translator.translate(raw_texts, dest="en")
    translations = [t.text for t in trans_objs]

    # 2) Mask & embed in batch (keywords are masked before embedding)
    masked = [mask_kw(t) for t in raw_texts]
    embeddings = model.encode(masked, batch_size=32, convert_to_numpy=True)

    # 3) Predict probabilities and similarities in bulk
    probs = war_clf.predict_proba(embeddings)[:, 1]
    sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
    has_kw = [_mask_re.search(t) is not None for t in raw_texts]
    # bool(...) keeps the column homogeneous (plain bools rather than a mix of
    # Python and numpy booleans) for the True/False mapping below.
    preds = [
        bool(kw or (p > 0.5) or (s > SIM_THRESHOLD))
        for kw, p, s in zip(has_kw, probs, sims)
    ]

    # 4) Split positives/negatives
    pos_texts = [t for t, p in zip(translations, preds) if p]
    neg_texts = [t for t, p in zip(translations, preds) if not p]
    df_pos = pd.DataFrame({"Positive (War-related)": pos_texts})
    df_neg = pd.DataFrame({"Negative (Not war-related)": neg_texts})

    # 5) Prepare plot DataFrame
    df_plot = pd.DataFrame({
        "index": list(range(len(sims))),
        "similarity": sims,
        "war_pred": preds,
        "message": translations,
    })
    # Hover text is HTML, so wrap long messages with <br> line breaks.
    df_plot["wrapped"] = df_plot["message"].apply(
        lambda m: "<br>".join(textwrap.wrap(m, width=80))
    )
    fig = px.scatter(
        df_plot,
        x="index",
        y="similarity",
        color=df_plot["war_pred"].map({True: "Inside", False: "Outside"}),
        custom_data=["wrapped"],
        labels={"war_pred": "Prediction", "similarity": "Cosine Similarity"},
    )
    fig.update_traces(
        hovertemplate="%{customdata[0]}<extra></extra>",
        hoverlabel=dict(align="left"),
    )
    fig.add_hline(y=SIM_THRESHOLD, line_dash="dash",
                  annotation_text="Threshold")
    fig.update_layout(margin=dict(l=20, r=20, t=30, b=20))
    return df_pos, df_neg, fig
# --- Random Single Message (smaller fetch) ---
def fetch_random_and_classify():
    """Fetch recent messages, pick one at random and classify it.

    Returns:
        A tuple ``(text, translation, label, similarity)``: the raw message,
        its English translation, the "Inside"/"Outside" label, and the cosine
        similarity to the centroid rounded to three decimals.

    Raises:
        gr.Error: If the API returned no message that carries text.
        httpx.HTTPStatusError: If the API answers with an error status.
        httpx.ReadTimeout: If the retry request times out as well.
    """
    try:
        resp = httpx.get(f"{BASE_URL}/messages?limit=100", timeout=15.0)
    except ReadTimeout:
        # Retry once with a smaller page and a longer timeout.
        resp = httpx.get(f"{BASE_URL}/messages?limit=50", timeout=30.0)
    resp.raise_for_status()

    # Only messages that actually carry text can be translated and embedded;
    # a missing, null or blank "text" is skipped.
    candidates = [m.get("text") for m in resp.json()]
    candidates = [t for t in candidates if isinstance(t, str) and t.strip()]
    if not candidates:
        # random.choice([]) would raise a bare IndexError; surface a readable
        # message in the UI instead.
        raise gr.Error("No text messages available to classify.")
    text = random.choice(candidates)

    translation = translator.translate(text, dest="en").text
    emb = model.encode([mask_kw(text)], convert_to_numpy=True)[0]
    war_prob = float(war_clf.predict_proba([emb])[0][1])
    # reshape(1, -1) mirrors group_and_plot and works whether the centroid is
    # stored as a flat vector or as a single-row matrix.
    sim_cent = float(cosine_similarity([emb], centroid.reshape(1, -1))[0][0])

    # Same decision rule as the bulk path: keyword hit, classifier, or
    # similarity above the threshold.
    war_pred = (
        _mask_re.search(text) is not None
        or war_prob > 0.5
        or sim_cent > SIM_THRESHOLD
    )
    label = "Inside (War-related)" if war_pred else "Outside (Not war-related)"
    return text, translation, label, round(sim_cent, 3)
# --- Build Responsive, Themed Interface ---
def build_interface():
    """Assemble the three-tab Gradio Blocks app and return it without launching.

    Tabs: "Bulk Preview" loads raw messages into a table, "Bulk Group &
    Visualize" classifies that table and plots it, and "Random Message"
    classifies a single randomly chosen message.
    """
    # Stylesheet handed to gr.Blocks; the media query switches the slider row
    # to a single column on narrow screens.
    css_rules = """
.gradio-container {
width: 90%;
max-width: 100%;
margin: 0 auto;
padding: 1rem;
}
.slider-row {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 2%;
}
@media (max-width: 768px) {
.gradio-container {
width: 98%;
padding: 0.5rem;
}
.slider-row {
grid-template-columns: 1fr;
}
.gr-button, .gr-slider, .gr-textbox, .gr-dropdown {
width: 100% !important;
}
.gr-dataframe-container {
overflow-x: auto;
}
.gr-plot {
width: 100% !important;
height: auto !important;
}
}
"""
    soft_theme = Soft(
        radius_size="md",
        spacing_size="lg",
        secondary_hue="teal",
        primary_hue="blue",
    )

    with gr.Blocks(theme=soft_theme, css=css_rules) as app:
        gr.Markdown("## Telegram Scraper & War/Ukraine Classifier")

        # Tab 1: fetch a window of raw messages into a table.
        with gr.Tab("Bulk Preview"):
            with gr.Row(elem_classes="slider-row"):
                count_slider = gr.Slider(
                    minimum=1, maximum=100, step=1, value=20,
                    label="Fetch Count",
                )
                offset_slider = gr.Slider(
                    minimum=0, maximum=500, step=1, value=0,
                    label="Offset",
                )
            load_button = gr.Button("Load Messages", variant="primary")
            raw_table = gr.Dataframe(
                label="Raw Scraped Messages",
                headers=["Raw message"],
                type="pandas",
            )
            load_button.click(
                preview_messages,
                [count_slider, offset_slider],
                [raw_table],
            )

        # Tab 2: classify the table loaded in tab 1 and plot the result.
        with gr.Tab("Bulk Group & Visualize"):
            group_button = gr.Button("Group & Plot", variant="primary")
            pos_table, neg_table = [
                gr.Dataframe(type="pandas", label=caption, wrap=True)
                for caption in (
                    "Positives (War-related)",
                    "Negatives (Not war-related)",
                )
            ]
            scatter_plot = gr.Plot(label="Similarity Scatter")
            group_button.click(
                group_and_plot,
                [raw_table],
                [pos_table, neg_table, scatter_plot],
            )

        # Tab 3: classify one randomly selected message.
        with gr.Tab("Random Message"):
            random_button = gr.Button("Fetch Random", variant="primary")
            raw_out, trans_out, pred_out, sim_out = [
                gr.Textbox(label=caption, interactive=False)
                for caption in (
                    "Raw Message",
                    "Translated",
                    "Prediction",
                    "Similarity",
                )
            ]
            random_button.click(
                fetch_random_and_classify,
                None,
                [raw_out, trans_out, pred_out, sim_out],
            )

    return app
if __name__ == "__main__":
    # Build the UI and start serving it only when executed as a script.
    demo = build_interface()
    demo.launch()