# Urdu-Summarizer / app.py
# Hugging Face Space file — author: Subayyal, commit message: "Update app.py",
# commit 4b53dcb (verified)
import streamlit as st
import re
import time
from typing import List, Dict
import pandas as pd
import plotly.express as px
import numpy as np
import networkx as nx
from lime.lime_text import LimeTextExplainer
import shap
# ----------------- Streamlit Page Config -----------------
# Page-level Streamlit settings; st.set_page_config must be the first
# Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "🧠 اردو متن خلاصہ ساز",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
# ----------------- Custom CSS -----------------
# Inject the Noto Nastaliq Urdu web font plus RTL layout classes used by the
# tabs below (.urdu-text, .summary-box, .sentence-bar, .sentence-text,
# .score-text). unsafe_allow_html=True is required so the raw <style> markup
# is rendered instead of HTML-escaped.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Noto+Nastaliq+Urdu:wght@400;700&display=swap');
.urdu-text { font-family: 'Noto Nastaliq Urdu', Arial, sans-serif; direction: rtl; text-align: right; line-height: 2; }
.summary-box { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 10px 0; }
.sentence-bar { height: 20px; border-radius: 5px; margin-bottom:5px; }
.sentence-text { font-size: 16px; margin-bottom:3px; }
.score-text { font-size: 14px; color: #333; margin-top: 5px; direction: rtl; }
</style>
""", unsafe_allow_html=True)
# ----------------- Urdu Text Summarizer -----------------
class UrduTextSummarizer:
    """Extractive Urdu text summarizer.

    Pipeline: sentence split -> TF-IDF vectors -> cosine-similarity graph ->
    PageRank (TextRank-style) scores -> top-N selection with a redundancy
    filter, plus LIME/SHAP explanations for the two highest-ranked sentences.
    """

    def __init__(self):
        # Common Urdu function words excluded from tokenization and keyword counts.
        self.urdu_stop_words = {'اور','کا','کی','کے','میں','سے','کو','نے','ہے','ہیں','تھا','تھی','تھے',
                                'گا','گی','گے','کہ','جو','یہ','وہ','اس','ان','پر','کر','کرنا','کیا',
                                'ہو','ہوا','ہوئی','ہوئے','بھی','تو','ہی','لیے','ساتھ','بعد','پہلے'}

    def tokenize(self, sentence: str) -> List[str]:
        """Return content words of *sentence*.

        Urdu punctuation is stripped; stop words and tokens of length <= 2
        are dropped. Bytes are decoded as UTF-8 (errors ignored) and any
        other type is coerced via str().
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', errors='ignore')
        elif not isinstance(sentence, str):
            sentence = str(sentence)
        words = re.sub(r'[۔،؟!؛:]', '', sentence).split()
        return [w for w in words if w not in self.urdu_stop_words and len(w) > 2]

    def extract_keywords(self, text: str) -> Dict:
        """Return the top-10 content words of *text* by raw frequency.

        'importance' is the frequency capped at 5 and scaled into [0, 1].
        """
        words = re.sub(r'[۔،؟!؛:]', '', text).split()
        word_freq = {}
        for w in words:
            if w not in self.urdu_stop_words and len(w) > 2:
                word_freq[w] = word_freq.get(w, 0) + 1
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
        return {'keywords': [{'word': w, 'frequency': f, 'importance': min(1.0, f / 5)} for w, f in sorted_words]}

    def get_sentence_score(self, sentences: List[str], tfidf: np.ndarray, sim_matrix: np.ndarray, i: int) -> float:
        """Mean of sentence i's TF-IDF norm and its average similarity row.

        Returns 0.0 for an all-zero TF-IDF vector. (Kept for API
        compatibility; summarize_text uses PageRank scores instead.)
        """
        vec = tfidf[i]
        norm = np.linalg.norm(vec)
        avg_sim = np.mean(sim_matrix[i])
        return (norm + avg_sim) / 2 if norm > 0 else 0.0

    def summarize_text(self, text: str) -> Dict:
        """Summarize *text* and return a dict with keys:
        'summary' (str), 'sentences' (per-sentence score/inclusion records),
        'keywords', 'duration' (seconds), and 'explanations' (LIME/SHAP).
        """
        start_time = time.time()
        # FIX: include the Urdu question mark '؟' as a sentence terminator —
        # tokenize() already strips it, but the original split regex omitted
        # it, so interrogative sentences were never split apart.
        sentences = [s.strip() for s in re.split(r'[۔!؟?]', text) if s.strip()]
        if len(sentences) == 0:
            return {'summary': '', 'sentences': [], 'keywords': {'keywords': []}, 'duration': 0.0, 'explanations': {}}
        if len(sentences) < 2:
            # Nothing to rank — echo the single sentence back.
            return {'summary': sentences[0], 'sentences': [{'sentence': sentences[0], 'score': 1.0, 'included': True, 'position': 1}],
                    'keywords': self.extract_keywords(text), 'duration': time.time() - start_time, 'explanations': {}}
        # Tokenize sentences
        sent_words = [self.tokenize(s) for s in sentences]
        all_words = list(set(w for sw in sent_words for w in sw))
        if not all_words:
            # Every word was a stop word / too short: fall back to the lead-3 heuristic.
            summary = '۔ '.join(sentences[:3])
            sentence_scores = [{'sentence': s, 'score': 0.0, 'included': i < 3, 'position': i + 1} for i, s in enumerate(sentences)]
            return {'summary': summary, 'sentences': sentence_scores, 'keywords': {'keywords': []}, 'duration': time.time() - start_time, 'explanations': {}}
        word_to_idx = {w: i for i, w in enumerate(all_words)}
        num_words = len(all_words)
        num_sents = len(sentences)
        # TF matrix (term frequency normalized by sentence length)
        tf = np.zeros((num_sents, num_words))
        for i, words in enumerate(sent_words):
            word_count = {}
            for w in words:
                word_count[w] = word_count.get(w, 0) + 1
            for w, count in word_count.items():
                tf[i, word_to_idx[w]] = count / len(words) if len(words) > 0 else 0
        # IDF (smoothed; note this can go negative for very common terms)
        df = np.sum(tf > 0, axis=0)
        idf = np.log(num_sents / (1 + df))
        # TF-IDF
        tfidf = tf * idf
        # Cosine similarity matrix between sentence vectors
        sim_matrix = np.zeros((num_sents, num_sents))
        for i in range(num_sents):
            for j in range(num_sents):
                if i != j:
                    vec_i = tfidf[i]
                    vec_j = tfidf[j]
                    norm_i = np.linalg.norm(vec_i)
                    norm_j = np.linalg.norm(vec_j)
                    if norm_i > 0 and norm_j > 0:
                        sim_matrix[i, j] = np.dot(vec_i, vec_j) / (norm_i * norm_j)
        # Graph and PageRank (uniform fallback if power iteration diverges)
        graph = nx.from_numpy_array(sim_matrix)
        try:
            scores = nx.pagerank(graph, max_iter=200, tol=1e-6)
        except nx.PowerIterationFailedConvergence:
            scores = {i: 1.0 / num_sents for i in range(num_sents)}
        # Normalize scores and add a small bonus for the first/last sentence.
        sentence_scores = []
        max_score = max(scores.values()) if scores else 1.0
        for i in range(num_sents):
            base_score = scores.get(i, 0.0) / max_score if max_score > 0 else 0.0
            position_bonus = 0.1 if i == 0 or i == num_sents - 1 else 0
            final_score = base_score + position_bonus
            sentence_scores.append({'sentence': sentences[i], 'score': final_score, 'included': False, 'position': i + 1})
        # Select top N sentences (30% of the text, clamped to [3, 6])
        N = max(3, min(6, int(len(sentences) * 0.3)))
        sorted_scores = sorted(sentence_scores, key=lambda x: x['score'], reverse=True)[:N]
        # Redundancy check: skip candidates nearly identical to a kept sentence.
        selected = []
        for cand in sorted_scores:
            add = True
            cand_vec = tfidf[cand['position'] - 1]
            for sel in selected:
                sel_vec = tfidf[sel['position'] - 1]
                sim = np.dot(cand_vec, sel_vec) / (np.linalg.norm(cand_vec) * np.linalg.norm(sel_vec)) if np.linalg.norm(cand_vec) > 0 and np.linalg.norm(sel_vec) > 0 else 0
                if sim > 0.8:
                    add = False
                    break
            if add:
                selected.append(cand)
        # Re-order kept sentences by their position in the original text.
        summary_sents = sorted(selected, key=lambda x: x['position'])
        summary = '۔ '.join([s['sentence'] for s in summary_sents]) + '۔' if summary_sents else ''
        # Mark which sentences made it into the summary.
        selected_sentences = {s['sentence'] for s in selected}
        for ss in sentence_scores:
            ss['included'] = ss['sentence'] in selected_sentences
        # Explainability: LIME and SHAP
        explanations = {'lime': [], 'shap': []}

        # Pseudo-classifier over raw text for LIME/SHAP: maps a string to
        # [score, 1 - score] where score is the clipped norm of a one-row
        # TF-IDF built from that string alone.
        def predictor(texts):
            scores = []
            for t in texts:
                try:
                    words = self.tokenize(t)
                    if not words:
                        scores.append([0.0, 1.0])
                        continue
                    temp_sent_words = [words]
                    temp_all_words = list(set(words))
                    temp_word_to_idx = {w: idx for idx, w in enumerate(temp_all_words)}
                    temp_tf = np.zeros((1, len(temp_all_words)))
                    word_count = {w: words.count(w) for w in words}
                    for w, count in word_count.items():
                        temp_tf[0, temp_word_to_idx[w]] = count / len(words)
                    temp_df = np.sum(temp_tf > 0, axis=0)
                    temp_idf = np.log(1 / (1 + temp_df))
                    temp_tfidf = temp_tf * temp_idf
                    norm = np.linalg.norm(temp_tfidf[0])
                    score = min(max(norm, 0.0), 1.0)
                    scores.append([score, 1.0 - score])
                except Exception:
                    scores.append([0.0, 1.0])
            return np.array(scores)

        # LIME explanations for the two top-ranked sentences.
        lime_explainer = LimeTextExplainer(class_names=["Score", "Not Score"], bow=False)
        top_indices = [s['position'] - 1 for s in sorted_scores[:2]]
        for idx in top_indices:
            try:
                exp = lime_explainer.explain_instance(sentences[idx], predictor, num_features=10, num_samples=100)
                explanations['lime'].append({'sentence': sentences[idx], 'exp': exp.as_list(label=0)})
            except Exception as e:
                explanations['lime'].append({'sentence': sentences[idx], 'exp': [('Error', f'LIME failed: {str(e)}')]})
        # SHAP explanations. FIX: KernelExplainer construction was unguarded —
        # it expects numeric background data, so feeding raw sentence strings
        # can raise and crash the whole summary. Degrade to error records instead.
        background_texts = sentences[:min(10, len(sentences))]
        try:
            shap_explainer = shap.KernelExplainer(predictor, background_texts)
            for idx in top_indices:
                try:
                    shap_values = shap_explainer.shap_values(sentences[idx], nsamples=100)[0]
                    explanations['shap'].append({'sentence': sentences[idx], 'shap_values': shap_values})
                except Exception as e:
                    explanations['shap'].append({'sentence': sentences[idx], 'shap_values': [0.0], 'error': str(e)})
        except Exception as e:
            for idx in top_indices:
                explanations['shap'].append({'sentence': sentences[idx], 'shap_values': [0.0], 'error': str(e)})
        keywords = self.extract_keywords(text)
        duration = time.time() - start_time
        return {'summary': summary, 'sentences': sentence_scores, 'keywords': keywords, 'duration': duration, 'explanations': explanations}
# ----------------- App Interface -----------------
st.title("🧠 اردو متن خلاصہ ساز")
st.sidebar.header("📑 متن یا مثال منتخب کریں")

summarizer = UrduTextSummarizer()

sample_text = "انڈیا کے زیر انتظام کشمیر میں واقع سلال ڈیم اُس وقت زیرِ بحث آیا ..."

# Sidebar choice: free-form text area vs. the bundled sample passage.
selected_sample = st.sidebar.selectbox("مثال منتخب کریں:", ["اپنا متن درج کریں", "نمونہ متن"])
if selected_sample == "اپنا متن درج کریں":
    user_input = st.text_area("اپنا اردو متن یہاں درج کریں:", height=200)
else:
    user_input = sample_text
# Run the summarizer and render results only after the button is pressed
# and the input is non-blank.
if st.button("خلاصہ تیار کریں") and user_input.strip():
    result = summarizer.summarize_text(user_input)

    # ----------------- Multi Tabs -----------------
    tabs = st.tabs(["📄 خلاصہ", "🧩 جملے", "🔑 اہم الفاظ", "📊 گراف", "🛠 Explainability"])

    # --- Summary Tab ---
    with tabs[0]:
        st.markdown(f"<div class='summary-box urdu-text'>{result['summary']}</div>", unsafe_allow_html=True)

    # --- Sentences Tab ---
    with tabs[1]:
        st.subheader("جملوں کی اہمیت")
        # Highest-scoring sentences first, each with a proportional score bar.
        sorted_sents = sorted(result['sentences'], key=lambda x: x['score'], reverse=True)
        for s in sorted_sents:
            bar_width = min(int(s['score'] * 100), 100)
            st.markdown(f"<div class='sentence-text'>{s['sentence']}</div>"
                        f"<div class='sentence-bar' style='width:{bar_width}%; background-color:#667eea'></div>"
                        f"<div class='score-text'>اسکور: {s['score']:.2f}</div>", unsafe_allow_html=True)

    # --- Keywords Tab ---
    with tabs[2]:
        df_kw = pd.DataFrame(result['keywords']['keywords'])
        st.dataframe(df_kw)
        # FIX: guard the bubble chart — an empty keyword list yields a
        # DataFrame without 'word'/'frequency' columns and px.scatter raises.
        if not df_kw.empty:
            fig_bubble = px.scatter(df_kw, x='word', y='frequency', size='frequency', color='importance',
                                    size_max=60, color_continuous_scale='Viridis', title='Keywords by Frequency & Importance')
            st.plotly_chart(fig_bubble, use_container_width=True, key="keyword_bubble")

    # --- Graphs Tab ---
    with tabs[3]:
        st.info("اس ٹیب میں فی الحال کوئی گراف شامل نہیں ہے۔")

    # --- Explainability Tab ---
    with tabs[4]:
        st.subheader("LIME Explanations (Word Contributions)")
        for i, exp in enumerate(result['explanations'].get('lime', [])):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            df_lime = pd.DataFrame(exp['exp'], columns=['Word', 'Contribution'])
            # FIX: LIME failure placeholders store an error *message* in the
            # Contribution column; px.bar cannot plot strings, so show the
            # message instead of crashing.
            if pd.api.types.is_numeric_dtype(df_lime['Contribution']):
                fig_lime = px.bar(df_lime, x='Contribution', y='Word', orientation='h', title=f'LIME Feature Contributions (Sentence {i+1})')
                st.plotly_chart(fig_lime, use_container_width=True, key=f"lime_bar_{i}")
            else:
                st.error(str(df_lime['Contribution'].iloc[0]))
        st.subheader("SHAP Explanations (Shapley Values)")
        for i, exp in enumerate(result['explanations'].get('shap', [])):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            if 'error' in exp:
                st.error(f"SHAP failed: {exp['error']}")
            else:
                # FIX: shap_values may be a (possibly 2-D) array whose length
                # differs from the sentence's word count; truncate both sides
                # to the same length or the DataFrame constructor raises.
                shap_vals = np.ravel(exp['shap_values'])
                words = exp['sentence'].split()
                n = min(len(words), len(shap_vals))
                df_shap = pd.DataFrame({'Word': words[:n], 'SHAP Value': shap_vals[:n]})
                fig_shap = px.bar(df_shap, x='SHAP Value', y='Word', orientation='h', title=f'SHAP Feature Contributions (Sentence {i+1})')
                st.plotly_chart(fig_shap, use_container_width=True, key=f"shap_bar_{i}")

    st.info(f"⏱ خلاصہ تیار کرنے میں وقت: {result['duration']:.2f} سیکنڈ")