# Spaces: Sleeping
import html
import re
import time
from collections import Counter
from typing import List, Dict

import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import shap
import streamlit as st
from lime.lime_text import LimeTextExplainer
# ----------------- Streamlit Page Config -----------------
# Page-level settings are collected in one mapping and applied in a single call.
_PAGE_SETTINGS = {
    "page_title": "🧠 اردو متن خلاصہ ساز",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# ----------------- Custom CSS -----------------
# Stylesheet injected into the page: Nastaliq web font plus right-to-left
# layout for Urdu text, and the boxes/bars used by the result tabs.
_CUSTOM_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Noto+Nastaliq+Urdu:wght@400;700&display=swap');
.urdu-text { font-family: 'Noto Nastaliq Urdu', Arial, sans-serif; direction: rtl; text-align: right; line-height: 2; }
.summary-box { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin: 10px 0; }
.sentence-bar { height: 20px; border-radius: 5px; margin-bottom:5px; }
.sentence-text { font-size: 16px; margin-bottom:3px; }
.score-text { font-size: 14px; color: #333; margin-top: 5px; direction: rtl; }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# ----------------- Urdu Text Summarizer -----------------
class UrduTextSummarizer:
    """Extractive summarizer for Urdu text with LIME/SHAP word attributions.

    Sentences are ranked with PageRank over a graph whose edge weights are the
    cosine similarities between per-sentence TF-IDF vectors. The best-ranked,
    mutually non-redundant sentences form the summary.
    """

    # Punctuation removed before a text is split into words.
    _PUNCTUATION = re.compile(r'[۔،؟!؛:]')
    # Sentence terminators; includes the Urdu question mark, which the word
    # tokenizer already treats as punctuation.
    _SENTENCE_END = re.compile(r'[۔!?؟]')
    # Candidates more similar than this to an already selected sentence are dropped.
    _REDUNDANCY_THRESHOLD = 0.8
    # Score bonus granted to the first and the last sentence of the text.
    _POSITION_BONUS = 0.1

    def __init__(self):
        """Create a summarizer with the built-in Urdu stop-word list."""
        self.urdu_stop_words = {'اور','کا','کی','کے','میں','سے','کو','نے','ہے','ہیں','تھا','تھی','تھے',
                                'گا','گی','گے','کہ','جو','یہ','وہ','اس','ان','پر','کر','کرنا','کیا',
                                'ہو','ہوا','ہوئی','ہوئے','بھی','تو','ہی','لیے','ساتھ','بعد','پہلے'}

    def tokenize(self, sentence: str) -> List[str]:
        """Return the content words of *sentence*.

        Punctuation is stripped, the text is split on whitespace, and stop
        words as well as words of two characters or fewer are discarded.
        ``bytes`` input is decoded as UTF-8; any other non-string input is
        converted with ``str()``.
        """
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', errors='ignore')
        elif not isinstance(sentence, str):
            sentence = str(sentence)
        words = self._PUNCTUATION.sub('', sentence).split()
        return [w for w in words if w not in self.urdu_stop_words and len(w) > 2]

    def extract_keywords(self, text: str) -> Dict:
        """Return the ten most frequent content words of *text*.

        Returns:
            ``{'keywords': [{'word', 'frequency', 'importance'}, ...]}`` ordered
            by descending frequency; ``importance`` is ``frequency / 5`` capped
            at 1.0.
        """
        counts = Counter(self.tokenize(text))
        return {'keywords': [{'word': w, 'frequency': f, 'importance': min(1.0, f / 5)}
                             for w, f in counts.most_common(10)]}

    def get_sentence_score(self, sentences: List[str], tfidf: np.ndarray, sim_matrix: np.ndarray, i: int) -> float:
        """Return the mean of sentence *i*'s TF-IDF norm and its average similarity.

        Yields 0.0 when the sentence's TF-IDF vector is all zeros. *sentences*
        is accepted for interface compatibility and is not read.
        """
        norm = np.linalg.norm(tfidf[i])
        if norm <= 0:
            return 0.0
        return (norm + np.mean(sim_matrix[i])) / 2

    @staticmethod
    def _empty_explanations() -> Dict:
        """Return the explanations container with no entries."""
        return {'lime': [], 'shap': []}

    @staticmethod
    def _tfidf_matrix(sent_words: List[List[str]]) -> np.ndarray:
        """Build the (sentences x vocabulary) TF-IDF matrix for tokenized sentences."""
        vocab = sorted({w for words in sent_words for w in words})
        column = {w: i for i, w in enumerate(vocab)}
        num_sents = len(sent_words)
        tf = np.zeros((num_sents, len(vocab)))
        for row, words in enumerate(sent_words):
            for w, count in Counter(words).items():
                tf[row, column[w]] = count / len(words)
        df = np.count_nonzero(tf, axis=0)
        # Smoothed IDF is always >= 1. The unsmoothed log(n / (1 + df)) is zero
        # for words found in n-1 sentences and negative for words found in all
        # of them, which zeroes out short texts and feeds negative edge weights
        # to PageRank.
        idf = np.log((1 + num_sents) / (1 + df)) + 1.0
        return tf * idf

    @staticmethod
    def _cosine_similarity(tfidf: np.ndarray) -> np.ndarray:
        """Return pairwise cosine similarities between rows, with a zero diagonal."""
        norms = np.linalg.norm(tfidf, axis=1)
        # All-zero rows stay all-zero after division by 1, so their similarity is 0.
        unit = tfidf / np.where(norms > 0, norms, 1.0)[:, None]
        sim = np.clip(unit @ unit.T, 0.0, 1.0)
        np.fill_diagonal(sim, 0.0)
        return sim

    @staticmethod
    def _pagerank(sim_matrix: np.ndarray) -> Dict:
        """Return PageRank scores per sentence index, uniform if iteration fails."""
        graph = nx.from_numpy_array(sim_matrix)
        try:
            return nx.pagerank(graph, max_iter=200, tol=1e-6)
        except nx.PowerIterationFailedConvergence:
            count = len(sim_matrix)
            return {i: 1.0 / count for i in range(count)}

    def _proxy_scores(self, texts) -> np.ndarray:
        """Score each text for the explainers; returns rows of ``[score, 1 - score]``.

        The score is the norm of the text's term-frequency vector scaled by
        ``log 2`` (a one-sentence TF-IDF in which every term has document
        frequency 1), clipped to at most 1. Texts without content words score 0.
        NOTE(review): this proxy does not use the document-level TF-IDF weights
        or the PageRank scores that drive sentence selection.
        """
        rows = []
        for text in texts:
            words = self.tokenize(text)
            if not words:
                rows.append([0.0, 1.0])
                continue
            tf = np.array(list(Counter(words).values()), dtype=float) / len(words)
            score = min(1.0, float(np.log(2.0) * np.linalg.norm(tf)))
            rows.append([score, 1.0 - score])
        return np.array(rows, dtype=float).reshape(-1, 2)

    def _masked_scores(self, words: List[str], masks: np.ndarray) -> np.ndarray:
        """Score the texts obtained by keeping only the words whose mask entry is set."""
        texts = [' '.join(w for w, keep in zip(words, row) if keep > 0.5) for row in masks]
        return self._proxy_scores(texts)[:, 0]

    def _explain(self, sentences: List[str], indices: List[int]) -> Dict:
        """Produce LIME and SHAP word attributions for the sentences at *indices*.

        A failure inside either library is recorded on the affected entry
        instead of aborting the summary.
        """
        explanations = self._empty_explanations()
        # Whitespace splitting keeps LIME's word units aligned with the
        # whitespace-based tokenizer and with the SHAP features below.
        lime_explainer = LimeTextExplainer(class_names=["Score", "Not Score"], bow=False,
                                           split_expression=r'\s+')
        for idx in indices:
            sentence = sentences[idx]
            try:
                # Label 0 ("Score") must be requested explicitly; otherwise only
                # label 1 is explained and as_list(label=0) fails.
                exp = lime_explainer.explain_instance(sentence, self._proxy_scores, labels=(0,),
                                                      num_features=10, num_samples=100)
                explanations['lime'].append({'sentence': sentence, 'exp': exp.as_list(label=0)})
            except Exception as e:
                explanations['lime'].append({'sentence': sentence, 'exp': [('Error', f'LIME failed: {str(e)}')]})
        for idx in indices:
            sentence = sentences[idx]
            try:
                words = sentence.split()
                # KernelExplainer works on numeric features, so each word is a
                # binary "present" feature: the background is the sentence with
                # every word removed, the instance is the full sentence.
                explainer = shap.KernelExplainer(
                    lambda masks, words=words: self._masked_scores(words, masks),
                    np.zeros((1, len(words))))
                values = explainer.shap_values(np.ones((1, len(words))), nsamples=100)
                per_word = np.asarray(values, dtype=float)[0].ravel().tolist()
                explanations['shap'].append({'sentence': sentence, 'shap_values': per_word})
            except Exception as e:
                explanations['shap'].append({'sentence': sentence, 'shap_values': [0.0], 'error': str(e)})
        return explanations

    def summarize_text(self, text: str) -> Dict:
        """Summarize *text* and report per-sentence scores, keywords and explanations.

        Returns:
            A dict with keys ``summary`` (str), ``sentences`` (list of dicts with
            ``sentence``, ``score``, ``included``, ``position``), ``keywords``
            (see ``extract_keywords``), ``duration`` (seconds) and
            ``explanations`` (dict with ``lime`` and ``shap`` lists, both empty
            when the text is too short to rank).
        """
        start_time = time.time()
        sentences = [s.strip() for s in self._SENTENCE_END.split(text) if s.strip()]
        if not sentences:
            return {'summary': '', 'sentences': [], 'keywords': {'keywords': []}, 'duration': 0.0,
                    'explanations': self._empty_explanations()}
        if len(sentences) == 1:
            only = sentences[0]
            return {'summary': only,
                    'sentences': [{'sentence': only, 'score': 1.0, 'included': True, 'position': 1}],
                    'keywords': self.extract_keywords(text), 'duration': time.time() - start_time,
                    'explanations': self._empty_explanations()}

        sent_words = [self.tokenize(s) for s in sentences]
        if not any(sent_words):
            # Nothing to rank on: fall back to the leading sentences.
            summary = '۔ '.join(sentences[:3])
            sentence_scores = [{'sentence': s, 'score': 0.0, 'included': i < 3, 'position': i + 1}
                               for i, s in enumerate(sentences)]
            return {'summary': summary, 'sentences': sentence_scores, 'keywords': {'keywords': []},
                    'duration': time.time() - start_time, 'explanations': self._empty_explanations()}

        num_sents = len(sentences)
        tfidf = self._tfidf_matrix(sent_words)
        sim_matrix = self._cosine_similarity(tfidf)
        ranks = self._pagerank(sim_matrix)

        # Normalise ranks to [0, 1] and favour the opening and closing sentences.
        max_rank = max(ranks.values()) if ranks else 1.0
        sentence_scores = []
        for i, sentence in enumerate(sentences):
            base_score = ranks.get(i, 0.0) / max_rank if max_rank > 0 else 0.0
            bonus = self._POSITION_BONUS if i in (0, num_sents - 1) else 0
            sentence_scores.append({'sentence': sentence, 'score': base_score + bonus,
                                    'included': False, 'position': i + 1})

        top_n = max(3, min(6, int(num_sents * 0.3)))
        sorted_scores = sorted(sentence_scores, key=lambda x: x['score'], reverse=True)[:top_n]

        # Greedy redundancy filter: skip a candidate that is a near-duplicate of
        # a sentence already chosen.
        selected = []
        for cand in sorted_scores:
            cand_idx = cand['position'] - 1
            if all(sim_matrix[cand_idx, sel['position'] - 1] <= self._REDUNDANCY_THRESHOLD
                   for sel in selected):
                selected.append(cand)

        summary_sents = sorted(selected, key=lambda x: x['position'])
        summary = '۔ '.join(s['sentence'] for s in summary_sents) + '۔' if summary_sents else ''

        # Flag by position, so that a repeated sentence dropped as redundant is
        # not reported as included.
        selected_positions = {s['position'] for s in selected}
        for entry in sentence_scores:
            entry['included'] = entry['position'] in selected_positions

        top_indices = [s['position'] - 1 for s in sorted_scores[:2]]
        explanations = self._explain(sentences, top_indices)
        keywords = self.extract_keywords(text)
        duration = time.time() - start_time
        return {'summary': summary, 'sentences': sentence_scores, 'keywords': keywords,
                'duration': duration, 'explanations': explanations}
# ----------------- App Interface -----------------
# Page flow: pick the built-in sample or type a text, press the button, then
# browse the result in tabs. Text taken from the input is HTML-escaped before
# it is placed inside markup rendered with unsafe_allow_html.
st.title("🧠 اردو متن خلاصہ ساز")
st.sidebar.header("📑 متن یا مثال منتخب کریں")
summarizer = UrduTextSummarizer()
sample_text = "انڈیا کے زیر انتظام کشمیر میں واقع سلال ڈیم اُس وقت زیرِ بحث آیا ..."
selected_sample = st.sidebar.selectbox("مثال منتخب کریں:", ["اپنا متن درج کریں", "نمونہ متن"])
if selected_sample == "اپنا متن درج کریں":
    user_input = st.text_area("اپنا اردو متن یہاں درج کریں:", height=200)
else:
    user_input = sample_text

if st.button("خلاصہ تیار کریں") and user_input.strip():
    result = summarizer.summarize_text(user_input)
    # Short inputs may come back without per-sentence explanations.
    explanations = result.get('explanations') or {}

    # ----------------- Multi Tabs -----------------
    tabs = st.tabs(["📄 خلاصہ", "🧩 جملے", "🔑 اہم الفاظ", "📊 گراف", "🛠 Explainability"])

    # --- Summary Tab ---
    with tabs[0]:
        st.markdown(f"<div class='summary-box urdu-text'>{html.escape(result['summary'])}</div>",
                    unsafe_allow_html=True)

    # --- Sentences Tab ---
    with tabs[1]:
        st.subheader("جملوں کی اہمیت")
        sorted_sents = sorted(result['sentences'], key=lambda x: x['score'], reverse=True)
        for s in sorted_sents:
            # Scores may exceed 1.0 because of the position bonus; clamp the bar to 0-100%.
            bar_width = max(0, min(int(s['score'] * 100), 100))
            st.markdown(f"<div class='sentence-text'>{html.escape(s['sentence'])}</div>"
                        f"<div class='sentence-bar' style='width:{bar_width}%; background-color:#667eea'></div>"
                        f"<div class='score-text'>اسکور: {s['score']:.2f}</div>", unsafe_allow_html=True)

    # --- Keywords Tab ---
    with tabs[2]:
        df_kw = pd.DataFrame(result['keywords']['keywords'])
        if df_kw.empty:
            # Without rows the frame has no 'word'/'frequency' columns to plot.
            st.info("کوئی اہم الفاظ نہیں ملے۔")
        else:
            st.dataframe(df_kw)
            fig_bubble = px.scatter(df_kw, x='word', y='frequency', size='frequency', color='importance',
                                    size_max=60, color_continuous_scale='Viridis',
                                    title='Keywords by Frequency & Importance')
            st.plotly_chart(fig_bubble, use_container_width=True, key="keyword_bubble")

    # --- Graphs Tab ---
    with tabs[3]:
        st.info("اس ٹیب میں فی الحال کوئی گراف شامل نہیں ہے۔")

    # --- Explainability Tab ---
    with tabs[4]:
        st.subheader("LIME Explanations (Word Contributions)")
        lime_items = explanations.get('lime', [])
        if not lime_items:
            st.info("No LIME explanations are available for this text.")
        for i, exp in enumerate(lime_items):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            rows = exp['exp']
            # A failed explanation is reported as a single ('Error', message) row.
            if rows and rows[0][0] == 'Error':
                st.error(str(rows[0][1]))
                continue
            df_lime = pd.DataFrame(rows, columns=['Word', 'Contribution'])
            fig_lime = px.bar(df_lime, x='Contribution', y='Word', orientation='h',
                              title=f'LIME Feature Contributions (Sentence {i+1})')
            st.plotly_chart(fig_lime, use_container_width=True, key=f"lime_bar_{i}")

        st.subheader("SHAP Explanations (Shapley Values)")
        shap_items = explanations.get('shap', [])
        if not shap_items:
            st.info("No SHAP explanations are available for this text.")
        for i, exp in enumerate(shap_items):
            st.markdown(f"**جملہ {i+1}:** {exp['sentence']}")
            if 'error' in exp:
                st.error(f"SHAP failed: {exp['error']}")
            else:
                values = list(exp['shap_values'])
                words = exp['sentence'].split()
                # Pair words and values up to the shorter of the two so the
                # DataFrame columns always have equal length.
                count = min(len(words), len(values))
                df_shap = pd.DataFrame({'Word': words[:count], 'SHAP Value': values[:count]})
                fig_shap = px.bar(df_shap, x='SHAP Value', y='Word', orientation='h',
                                  title=f'SHAP Feature Contributions (Sentence {i+1})')
                st.plotly_chart(fig_shap, use_container_width=True, key=f"shap_bar_{i}")

    st.info(f"⏱ خلاصہ تیار کرنے میں وقت: {result['duration']:.2f} سیکنڈ")