# Urdu-Summarizer / app.py
# Hugging Face Space file — author: Subayyal, commit message: "Update app.py",
# commit 4b53dcb (verified)
import streamlit as st
import re
import time
from typing import List, Dict
import pandas as pd
import plotly.express as px
import numpy as np
import networkx as nx
from lime.lime_text import LimeTextExplainer
import shap
# ----------------- Streamlit Page Config -----------------
# Page-level Streamlit settings; st.set_page_config must be the first
# Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "🧠 اردو متن خلاصہ ساز",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
# ----------------- Custom CSS -----------------
# Inject the Noto Nastaliq Urdu web font plus RTL layout classes used by the
# tabs below (.urdu-text, .summary-box, .sentence-bar, .sentence-text,
# .score-text). unsafe_allow_html=True is required so the raw <style> markup
# is rendered instead of HTML-escaped.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Noto+Nastaliq+Urdu:wght@400;700&display=swap');
.urdu-text { font-family: 'Noto Nastaliq Urdu', Arial, sans-serif; direction: rtl; text-align: right; line-height: 2; }
.summary-box { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 10px 0; }
.sentence-bar { height: 20px; border-radius: 5px; margin-bottom:5px; }
.sentence-text { font-size: 16px; margin-bottom:3px; }
.score-text { font-size: 14px; color: #333; margin-top: 5px; direction: rtl; }
</style>
""", unsafe_allow_html=True)
# ----------------- Urdu Text Summarizer -----------------
class UrduTextSummarizer:
    """Extractive Urdu text summarizer.

    Pipeline: sentence split -> TF-IDF vectors -> cosine-similarity graph ->
    PageRank (TextRank-style) scores -> top-N selection with a redundancy
    filter, plus LIME/SHAP explanations for the two highest-ranked sentences.
    """

    def __init__(self):
        # Common Urdu function words excluded from tokenization and keyword counts.
        self.urdu_stop_words = {'اور','کا','کی','کے','میں','سے','کو','نے','ہے','ہیں','تھا','تھی','تھے',
                                'گا','گی','گے','کہ','جو','یہ','وہ','اس','ان','پر','کر','کرنا','کیا',
                                'ہو','ہوا','ہوئی','ہوئے','بھی','تو','ہی','لیے','ساتھ','بعد','پہلے'}

    def tokenize(self, sentence: str) -> List[str]:
        """Return content words of *sentence*.

        Urdu punctuation is stripped; stop words and tokens of length <= 2
        are dropped. Bytes are decoded as UTF-8 (errors ignored) and any
        other type is coerced via str().
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', errors='ignore')
        elif not isinstance(sentence, str):
            sentence = str(sentence)
        words = re.sub(r'[۔،؟!؛:]', '', sentence).split()
        return [w for w in words if w not in self.urdu_stop_words and len(w) > 2]

    def extract_keywords(self, text: str) -> Dict:
        """Return the top-10 content words of *text* by raw frequency.

        'importance' is the frequency capped at 5 and scaled into [0, 1].
        """
        words = re.sub(r'[۔،؟!؛:]', '', text).split()
        word_freq = {}
        for w in words:
            if w not in self.urdu_stop_words and len(w) > 2:
                word_freq[w] = word_freq.get(w, 0) + 1
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
        return {'keywords': [{'word': w, 'frequency': f, 'importance': min(1.0, f / 5)} for w, f in sorted_words]}

    def get_sentence_score(self, sentences: List[str], tfidf: np.ndarray, sim_matrix: np.ndarray, i: int) -> float:
        """Mean of sentence i's TF-IDF norm and its average similarity row.

        Returns 0.0 for an all-zero TF-IDF vector. (Kept for API
        compatibility; summarize_text uses PageRank scores instead.)
        """
        vec = tfidf[i]
        norm = np.linalg.norm(vec)
        avg_sim = np.mean(sim_matrix[i])
        return (norm + avg_sim) / 2 if norm > 0 else 0.0

    def summarize_text(self, text: str) -> Dict:
        """Summarize *text* and return a dict with keys:
        'summary' (str), 'sentences' (per-sentence score/inclusion records),
        'keywords', 'duration' (seconds), and 'explanations' (LIME/SHAP).
        """
        start_time = time.time()
        # FIX: include the Urdu question mark '؟' as a sentence terminator —
        # tokenize() already strips it, but the original split regex omitted
        # it, so interrogative sentences were never split apart.
        sentences = [s.strip() for s in re.split(r'[۔!؟?]', text) if s.strip()]
        if len(sentences) == 0:
            return {'summary': '', 'sentences': [], 'keywords': {'keywords': []}, 'duration': 0.0, 'explanations': {}}
        if len(sentences) < 2:
            # Nothing to rank — echo the single sentence back.
            return {'summary': sentences[0], 'sentences': [{'sentence': sentences[0], 'score': 1.0, 'included': True, 'position': 1}],
                    'keywords': self.extract_keywords(text), 'duration': time.time() - start_time, 'explanations': {}}
        # Tokenize sentences
        sent_words = [self.tokenize(s) for s in sentences]
        all_words = list(set(w for sw in sent_words for w in sw))
        if not all_words:
            # Every word was a stop word / too short: fall back to the lead-3 heuristic.
            summary = '۔ '.join(sentences[:3])
            sentence_scores = [{'sentence': s, 'score': 0.0, 'included': i < 3, 'position': i + 1} for i, s in enumerate(sentences)]
            return {'summary': summary, 'sentences': sentence_scores, 'keywords': {'keywords': []}, 'duration': time.time() - start_time, 'explanations': {}}
        word_to_idx = {w: i for i, w in enumerate(all_words)}
        num_words = len(all_words)
        num_sents = len(sentences)
        # TF matrix (term frequency normalized by sentence length)
        tf = np.zeros((num_sents, num_words))
        for i, words in enumerate(sent_words):
            word_count = {}
            for w in words:
                word_count[w] = word_count.get(w, 0) + 1
            for w, count in word_count.items():
                tf[i, word_to_idx[w]] = count / len(words) if len(words) > 0 else 0
        # IDF (smoothed; note this can go negative for very common terms)
        df = np.sum(tf > 0, axis=0)
        idf = np.log(num_sents / (1 + df))
        # TF-IDF
        tfidf = tf * idf
        # Cosine similarity matrix between sentence vectors
        sim_matrix = np.zeros((num_sents, num_sents))
        for i in range(num_sents):
            for j in range(num_sents):
                if i != j:
                    vec_i = tfidf[i]
                    vec_j = tfidf[j]
                    norm_i = np.linalg.norm(vec_i)
                    norm_j = np.linalg.norm(vec_j)
                    if norm_i > 0 and norm_j > 0:
                        sim_matrix[i, j] = np.dot(vec_i, vec_j) / (norm_i * norm_j)
        # Graph and PageRank (uniform fallback if power iteration diverges)
        graph = nx.from_numpy_array(sim_matrix)
        try:
            scores = nx.pagerank(graph, max_iter=200, tol=1e-6)
        except nx.PowerIterationFailedConvergence:
            scores = {i: 1.0 / num_sents for i in range(num_sents)}
        # Normalize scores and add a small bonus for the first/last sentence.
        sentence_scores = []
        max_score = max(scores.values()) if scores else 1.0
        for i in range(num_sents):
            base_score = scores.get(i, 0.0) / max_score if max_score > 0 else 0.0
            position_bonus = 0.1 if i == 0 or i == num_sents - 1 else 0
            final_score = base_score + position_bonus
            sentence_scores.append({'sentence': sentences[i], 'score': final_score, 'included': False, 'position': i + 1})
        # Select top N sentences (30% of the text, clamped to [3, 6])
        N = max(3, min(6, int(len(sentences) * 0.3)))
        sorted_scores = sorted(sentence_scores, key=lambda x: x['score'], reverse=True)[:N]
        # Redundancy check: skip candidates nearly identical to a kept sentence.
        selected = []
        for cand in sorted_scores:
            add = True
            cand_vec = tfidf[cand['position'] - 1]
            for sel in selected:
                sel_vec = tfidf[sel['position'] - 1]
                sim = np.dot(cand_vec, sel_vec) / (np.linalg.norm(cand_vec) * np.linalg.norm(sel_vec)) if np.linalg.norm(cand_vec) > 0 and np.linalg.norm(sel_vec) > 0 else 0
                if sim > 0.8:
                    add = False
                    break
            if add:
                selected.append(cand)
        # Re-order kept sentences by their position in the original text.
        summary_sents = sorted(selected, key=lambda x: x['position'])
        summary = '۔ '.join([s['sentence'] for s in summary_sents]) + '۔' if summary_sents else ''
        # Mark which sentences made it into the summary.
        selected_sentences = {s['sentence'] for s in selected}
        for ss in sentence_scores:
            ss['included'] = ss['sentence'] in selected_sentences
        # Explainability: LIME and SHAP
        explanations = {'lime': [], 'shap': []}

        # Pseudo-classifier over raw text for LIME/SHAP: maps a string to
        # [score, 1 - score] where score is the clipped norm of a one-row
        # TF-IDF built from that string alone.
        def predictor(texts):
            scores = []
            for t in texts:
                try:
                    words = self.tokenize(t)
                    if not words:
                        scores.append([0.0, 1.0])
                        continue
                    temp_sent_words = [words]
                    temp_all_words = list(set(words))
                    temp_word_to_idx = {w: idx for idx, w in enumerate(temp_all_words)}
                    temp_tf = np.zeros((1, len(temp_all_words)))
                    word_count = {w: words.count(w) for w in words}
                    for w, count in word_count.items():
                        temp_tf[0, temp_word_to_idx[w]] = count / len(words)
                    temp_df = np.sum(temp_tf > 0, axis=0)
                    temp_idf = np.log(1 / (1 + temp_df))
                    temp_tfidf = temp_tf * temp_idf
                    norm = np.linalg.norm(temp_tfidf[0])
                    score = min(max(norm, 0.0), 1.0)
                    scores.append([score, 1.0 - score])
                except Exception:
                    scores.append([0.0, 1.0])
            return np.array(scores)

        # LIME explanations for the two top-ranked sentences.
        lime_explainer = LimeTextExplainer(class_names=["Score", "Not Score"], bow=False)
        top_indices = [s['position'] - 1 for s in sorted_scores[:2]]
        for idx in top_indices:
            try:
                exp = lime_explainer.explain_instance(sentences[idx], predictor, num_features=10, num_samples=100)
                explanations['lime'].append({'sentence': sentences[idx], 'exp': exp.as_list(label=0)})
            except Exception as e:
                explanations['lime'].append({'sentence': sentences[idx], 'exp': [('Error', f'LIME failed: {str(e)}')]})
        # SHAP explanations. FIX: KernelExplainer construction was unguarded —
        # it expects numeric background data, so feeding raw sentence strings
        # can raise and crash the whole summary. Degrade to error records instead.
        background_texts = sentences[:min(10, len(sentences))]
        try:
            shap_explainer = shap.KernelExplainer(predictor, background_texts)
            for idx in top_indices:
                try:
                    shap_values = shap_explainer.shap_values(sentences[idx], nsamples=100)[0]
                    explanations['shap'].append({'sentence': sentences[idx], 'shap_values': shap_values})
                except Exception as e:
                    explanations['shap'].append({'sentence': sentences[idx], 'shap_values': [0.0], 'error': str(e)})
        except Exception as e:
            for idx in top_indices:
                explanations['shap'].append({'sentence': sentences[idx], 'shap_values': [0.0], 'error': str(e)})
        keywords = self.extract_keywords(text)
        duration = time.time() - start_time
        return {'summary': summary, 'sentences': sentence_scores, 'keywords': keywords, 'duration': duration, 'explanations': explanations}
# ----------------- App Interface -----------------
st.title("🧠 اردو متن خلاصہ ساز")
st.sidebar.header("📑 متن یا مثال منتخب کریں")

summarizer = UrduTextSummarizer()

sample_text = "انڈیا کے زیر انتظام کشمیر میں واقع سلال ڈیم اُس وقت زیرِ بحث آیا ..."

# Sidebar choice: free-form text area vs. the bundled sample passage.
selected_sample = st.sidebar.selectbox("مثال منتخب کریں:", ["اپنا متن درج کریں", "نمونہ متن"])
if selected_sample == "اپنا متن درج کریں":
    user_input = st.text_area("اپنا اردو متن یہاں درج کریں:", height=200)
else:
    user_input = sample_text
# Run the summarizer and render results only after the button is pressed
# and the input is non-blank.
if st.button("خلاصہ تیار کریں") and user_input.strip():
    result = summarizer.summarize_text(user_input)

    # ----------------- Multi Tabs -----------------
    tabs = st.tabs(["📄 خلاصہ", "🧩 جملے", "🔑 اہم الفاظ", "📊 گراف", "🛠 Explainability"])

    # --- Summary Tab ---
    with tabs[0]:
        st.markdown(f"<div class='summary-box urdu-text'>{result['summary']}</div>", unsafe_allow_html=True)

    # --- Sentences Tab ---
    with tabs[1]:
        st.subheader("جملوں کی اہمیت")
        # Highest-scoring sentences first, each with a proportional score bar.
        sorted_sents = sorted(result['sentences'], key=lambda x: x['score'], reverse=True)
        for s in sorted_sents:
            bar_width = min(int(s['score'] * 100), 100)
            st.markdown(f"<div class='sentence-text'>{s['sentence']}</div>"
                        f"<div class='sentence-bar' style='width:{bar_width}%; background-color:#667eea'></div>"
                        f"<div class='score-text'>اسکور: {s['score']:.2f}</div>", unsafe_allow_html=True)

    # --- Keywords Tab ---
    with tabs[2]:
        df_kw = pd.DataFrame(result['keywords']['keywords'])
        st.dataframe(df_kw)
        # FIX: guard the bubble chart — an empty keyword list yields a
        # DataFrame without 'word'/'frequency' columns and px.scatter raises.
        if not df_kw.empty:
            fig_bubble = px.scatter(df_kw, x='word', y='frequency', size='frequency', color='importance',
                                    size_max=60, color_continuous_scale='Viridis', title='Keywords by Frequency & Importance')
            st.plotly_chart(fig_bubble, use_container_width=True, key="keyword_bubble")

    # --- Graphs Tab ---
    with tabs[3]:
        st.info("اس ٹیب میں فی الحال کوئی گراف شامل نہیں ہے۔")

    # --- Explainability Tab ---
    with tabs[4]:
        st.subheader("LIME Explanations (Word Contributions)")
        for i, exp in enumerate(result['explanations'].get('lime', [])):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            df_lime = pd.DataFrame(exp['exp'], columns=['Word', 'Contribution'])
            # FIX: LIME failure placeholders store an error *message* in the
            # Contribution column; px.bar cannot plot strings, so show the
            # message instead of crashing.
            if pd.api.types.is_numeric_dtype(df_lime['Contribution']):
                fig_lime = px.bar(df_lime, x='Contribution', y='Word', orientation='h', title=f'LIME Feature Contributions (Sentence {i+1})')
                st.plotly_chart(fig_lime, use_container_width=True, key=f"lime_bar_{i}")
            else:
                st.error(str(df_lime['Contribution'].iloc[0]))
        st.subheader("SHAP Explanations (Shapley Values)")
        for i, exp in enumerate(result['explanations'].get('shap', [])):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            if 'error' in exp:
                st.error(f"SHAP failed: {exp['error']}")
            else:
                # FIX: shap_values may be a (possibly 2-D) array whose length
                # differs from the sentence's word count; truncate both sides
                # to the same length or the DataFrame constructor raises.
                shap_vals = np.ravel(exp['shap_values'])
                words = exp['sentence'].split()
                n = min(len(words), len(shap_vals))
                df_shap = pd.DataFrame({'Word': words[:n], 'SHAP Value': shap_vals[:n]})
                fig_shap = px.bar(df_shap, x='SHAP Value', y='Word', orientation='h', title=f'SHAP Feature Contributions (Sentence {i+1})')
                st.plotly_chart(fig_shap, use_container_width=True, key=f"shap_bar_{i}")

    st.info(f"⏱ خلاصہ تیار کرنے میں وقت: {result['duration']:.2f} سیکنڈ")