Spaces:
Sleeping
Sleeping
| """ | |
| Urdu Sarcasm Detection and Explanation System | |
| Group 16: Muhammad Yahya Rahim, Ishal Rahat, Ammara Haroon | |
| """ | |
| import streamlit as st | |
| import torch | |
| from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from peft import PeftModel | |
| # Page config | |
| st.set_page_config( | |
| page_title="Urdu Sarcasm Interpreter", | |
| page_icon="🎭", | |
| layout="wide" | |
| ) | |
| st.markdown(""" | |
| <style> | |
| /* Global */ | |
| html, body, [class*="css"] { | |
| background-color: #0e0e0e; | |
| color: #e6e6e6; | |
| } | |
| /* Headers */ | |
| .main-header { | |
| font-size: 2.4rem; | |
| color: #a62d2d; /* maroon */ | |
| text-align: center; | |
| margin-bottom: 1.5rem; | |
| font-weight: 700; | |
| letter-spacing: 1px; | |
| } | |
| /* Result boxes */ | |
| .result-box { | |
| padding: 1.4rem; | |
| border-radius: 12px; | |
| margin: 1rem 0; | |
| background-color: #151515; | |
| } | |
| /* Sarcastic */ | |
| .sarcastic { | |
| border-left: 5px solid #7b1e1e; | |
| } | |
| /* Not sarcastic */ | |
| .not-sarcastic { | |
| border-left: 5px solid #2e7d32; | |
| } | |
| /* Urdu text */ | |
| .urdu-text { | |
| font-family: 'Noto Nastaliq Urdu', serif; | |
| font-size: 1.35rem; | |
| direction: rtl; | |
| text-align: right; | |
| color: #f2f2f2; | |
| } | |
| /* Buttons */ | |
| button[kind="primary"] { | |
| background-color: #7b1e1e !important; | |
| border: none; | |
| } | |
| button { | |
| background-color: #1c1c1c !important; | |
| color: #e6e6e6 !important; | |
| border-radius: 8px !important; | |
| border: 1px solid #333 !important; | |
| } | |
| /* Sidebar */ | |
| section[data-testid="stSidebar"] { | |
| background-color: #121212; | |
| border-right: 1px solid #2a2a2a; | |
| } | |
| /* Inputs */ | |
| textarea { | |
| background-color: #111 !important; | |
| color: #f2f2f2 !important; | |
| border-radius: 8px !important; | |
| border: 1px solid #333 !important; | |
| } | |
| /* Info box */ | |
| div[data-testid="stInfo"] { | |
| background-color: #141414; | |
| border-left: 5px solid #555; | |
| } | |
| /* Footer */ | |
| .footer { | |
| color: #888; | |
| font-size: 0.9rem; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # Cache model loading | |
| def load_models(): | |
| with st.spinner("Loading models... (this may take a minute)"): | |
| device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
| cache_dir = "./model_cache" | |
| # Load detector | |
| detector_tokenizer = XLMRobertaTokenizer.from_pretrained( | |
| "ishalr/urdu-sarcasm-detectorfin" | |
| ) | |
| detector_model = XLMRobertaForSequenceClassification.from_pretrained( | |
| "ishalr/urdu-sarcasm-detectorfin" | |
| ) | |
| detector_model.to(device).eval() | |
| # Load explainer tokenizer (from adapter repo) | |
| explainer_tokenizer = AutoTokenizer.from_pretrained( | |
| "ishalr/urdu-sarcasm-explainer", | |
| use_fast=False | |
| ) | |
| # Load BASE model you trained LoRA on | |
| explainer_model = AutoModelForCausalLM.from_pretrained( | |
| "facebook/xglm-1.7B" | |
| ) | |
| # IMPORTANT: align vocab size | |
| explainer_model.resize_token_embeddings(len(explainer_tokenizer)) | |
| # Load LoRA adapter | |
| explainer_model = PeftModel.from_pretrained( | |
| explainer_model, | |
| "ishalr/urdu-sarcasm-explainer" | |
| ) | |
| explainer_model.to(device).eval() | |
| return { | |
| 'detector_model': detector_model, | |
| 'detector_tokenizer': detector_tokenizer, | |
| 'explainer_model': explainer_model, | |
| 'explainer_tokenizer': explainer_tokenizer, | |
| 'device': device | |
| } | |
| def detect_sarcasm(text, models): | |
| """Detect if text is sarcastic""" | |
| encoding = models['detector_tokenizer']( | |
| text, | |
| truncation=True, | |
| padding='max_length', | |
| max_length=128, | |
| return_tensors='pt' | |
| ).to(models['device']) | |
| with torch.no_grad(): | |
| outputs = models['detector_model'](**encoding) | |
| logits = outputs.logits | |
| probs = torch.softmax(logits, dim=-1) | |
| confidence, predicted_class = torch.max(probs, dim=-1) | |
| return { | |
| 'is_sarcastic': predicted_class.item() == 1, | |
| 'confidence': confidence.item(), | |
| 'sarcastic_prob': probs[0][1].item(), | |
| 'not_sarcastic_prob': probs[0][0].item() | |
| } | |
| def explain_sarcasm(text, models): | |
| """Generate explanation for sarcastic text""" | |
| prompt = f"""### Instruction: | |
| Explain why this Urdu tweet is sarcastic in Urdu. Provide: | |
| 1) ظاہری معنی (Literal meaning) | |
| 2) اصل مطلب (Intended meaning) No repitition | |
| ### Input: | |
| {text} | |
| ### Response: | |
| """ | |
| inputs = models['explainer_tokenizer']( | |
| prompt, | |
| return_tensors="pt", | |
| max_length=512, | |
| truncation=True | |
| ).to(models['device']) | |
| with torch.no_grad(): | |
| outputs = models['explainer_model'].generate( | |
| **inputs, | |
| max_length=256, | |
| temperature=0.7, | |
| do_sample=True, | |
| top_p=0.9, | |
| pad_token_id=models['explainer_tokenizer'].eos_token_id | |
| ) | |
| response = models['explainer_tokenizer'].decode(outputs[0], skip_special_tokens=True) | |
| explanation = response.split("### Response:")[-1].strip() | |
| return explanation | |
| # Main app | |
| def main(): | |
| # Header | |
| st.markdown('<h1 class="main-header">اردو طنز کا تجزیہ کار</h1>', unsafe_allow_html=True) | |
| st.markdown('<h1 class="main-header">Urdu Sarcasm Interpreter</h1>', unsafe_allow_html=True) | |
| st.markdown(""" | |
| <div style='text-align: center; margin-bottom: 2rem;'> | |
| <p style='font-size: 1.2rem;'> | |
| Cross-Generational Communication Tool<br> | |
| <strong>Not just detection - we explain WHY text is sarcastic</strong> | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Sidebar | |
| with st.sidebar: | |
| st.header("About") | |
| st.write(""" | |
| This tool detects sarcasm in Urdu text and provides detailed explanations | |
| to help bridge understanding gaps between generations. | |
| """) | |
| st.header("Example Inputs") | |
| examples = [ | |
| "جی واہ! آج پھر لوڈشیڈنگ کا نیا ریکارڈ بنا، حکومت کا شکریہ کہ ہمیں اندھیرے میں بیٹھنے کا موقع دیا 😂", | |
| "آج موسم اچھا ہے", | |
| "شاندار کارکردگی! پٹرول کی قیمت پھر بڑھا دی، عوام تو خوش ہو گئے ہوں گے", | |
| ] | |
| for ex in examples: | |
| if st.button(ex, key=ex): | |
| st.session_state['input_text'] = ex | |
| # Load models | |
| models = load_models() | |
| # Input area | |
| st.markdown("### Enter Urdu Text") | |
| input_text = st.text_area( | |
| "Type or paste Urdu text here:", | |
| value=st.session_state.get('input_text', ''), | |
| height=150, | |
| placeholder="یہاں اردو متن لکھیں یا چسپاں کریں...", | |
| key='text_input' | |
| ) | |
| col1, col2 = st.columns([1, 4]) | |
| with col1: | |
| analyze_button = st.button("Analyze Text", type="primary", use_container_width=True) | |
| with col2: | |
| if st.button("Clear", use_container_width=True): | |
| st.session_state['input_text'] = '' | |
| st.rerun() | |
| # Analysis | |
| if analyze_button and input_text.strip(): | |
| with st.spinner("Analyzing..."): | |
| # Detect sarcasm | |
| result = detect_sarcasm(input_text, models) | |
| # Display input | |
| st.markdown("**Input Text:**") | |
| st.markdown(f'<div class="urdu-text">{input_text}</div>', unsafe_allow_html=True) | |
| # Results in columns | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.markdown("### Detection Result") | |
| if result['is_sarcastic']: | |
| st.markdown(f""" | |
| <div class="result-box sarcastic"> | |
| <h3>✓ Sarcastic</h3> | |
| <p><strong>Confidence:</strong> {result['confidence']:.1%}</p> | |
| <p><strong>Sarcastic Probability:</strong> {result['sarcastic_prob']:.1%}</p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| else: | |
| st.markdown(f""" | |
| <div class="result-box not-sarcastic"> | |
| <h3>✓ Not Sarcastic</h3> | |
| <p><strong>Confidence:</strong> {result['confidence']:.1%}</p> | |
| <p><strong>Not Sarcastic Probability:</strong> {result['not_sarcastic_prob']:.1%}</p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Probability chart | |
| import pandas as pd | |
| prob_df = pd.DataFrame({ | |
| 'Category': ['Not Sarcastic', 'Sarcastic'], | |
| 'Probability': [result['not_sarcastic_prob'], result['sarcastic_prob']] | |
| }) | |
| st.bar_chart(prob_df.set_index('Category')) | |
| with col2: | |
| st.markdown("### Explanation") | |
| if result['is_sarcastic']: | |
| with st.spinner("Generating explanation..."): | |
| explanation = explain_sarcasm(input_text, models) | |
| st.markdown( | |
| f""" | |
| <div class="urdu-text" style=" | |
| background-color: #1a1a1a; | |
| padding: 1.2rem; | |
| border-radius: 10px; | |
| border-left: 4px solid #7b1e1e; | |
| "> | |
| {explanation} | |
| </div> | |
| """, | |
| unsafe_allow_html=True | |
| ) | |
| else: | |
| st.info("یہ پیغام سیدھا اور واضح ہے۔ اس میں طنز کی کوئی علامت نہیں ہے۔") | |
| elif analyze_button: | |
| st.warning("Please enter some text to analyze") | |
| # Footer | |
| st.markdown("---") | |
| st.markdown(""" | |
| <div style='text-align: center; color: #666;'> | |
| <p> | |
| Cross-Generational Sarcasm Interpreter | Group 16<br> | |
| Muhammad Yahya Rahim, Ishal Rahat, Ammara Haroon | |
| </p> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| if __name__ == "__main__": | |
| main() |