Azidan committed on
Commit
5dbff08
·
verified ·
1 Parent(s): ecd8def

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -108
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import re
3
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
  import torch
@@ -8,29 +8,29 @@ import torch
8
  # =========================
9
  # Model setup (CPU-safe, Multi-language)
10
  # =========================
11
- # Use T5-based models that support text2text-generation
12
- EN_SUMMARIZER_MODEL = "google/flan-t5-base" # English - works with text2text
13
- AR_SUMMARIZER_MODEL = "csebuetnlp/mT5_multilingual_XLSum" # Multilingual (includes Arabic)
14
- QA_MODEL = "google/flan-t5-small" # Question generation
15
-
16
  print("Loading models... This may take a minute on first run.")
17
 
18
- # English summarizer using text2text-generation
19
- en_tokenizer = AutoTokenizer.from_pretrained(EN_SUMMARIZER_MODEL)
20
- en_model = AutoModelForSeq2SeqLM.from_pretrained(EN_SUMMARIZER_MODEL)
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Multilingual summarizer (for Arabic and other languages)
23
- ar_tokenizer = AutoTokenizer.from_pretrained(AR_SUMMARIZER_MODEL)
24
- ar_model = AutoModelForSeq2SeqLM.from_pretrained(AR_SUMMARIZER_MODEL)
25
 
26
- # Question generator
27
- question_generator = pipeline(
28
- "text2text-generation",
29
- model=QA_MODEL,
30
- device=-1 # CPU only
31
- )
32
 
33
- CHUNK_SIZE = 400 # Conservative chunk size for T5 models
34
 
35
  # =========================
36
  # Language Detection
@@ -39,8 +39,8 @@ def detect_language(text: str) -> str:
39
  """Simple heuristic: detect if text contains Arabic characters."""
40
  arabic_pattern = re.compile(r'[\u0600-\u06FF]')
41
  if arabic_pattern.search(text):
42
- return "ar_AR" # Arabic
43
- return "en_XX" # English
44
 
45
  # =========================
46
  # Utilities
@@ -52,7 +52,7 @@ def clean_text(text: str) -> str:
52
  text = re.sub(r"[.]{2,}", ".", text)
53
  text = re.sub(r"[']{2,}", "'", text)
54
  text = re.sub(r"\s+", " ", text)
55
- sentences = re.split(r'(?<=[.!?؟])\s+', text) # Added Arabic question mark
56
  seen = set()
57
  result = []
58
  for s in sentences:
@@ -64,7 +64,6 @@ def clean_text(text: str) -> str:
64
 
65
  def chunk_text(text: str, language: str):
66
  """Token-aware chunking to avoid model overflow."""
67
- # Use appropriate tokenizer based on language
68
  tokenizer = ar_tokenizer if language == "ar_AR" else en_tokenizer
69
 
70
  tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -75,85 +74,113 @@ def chunk_text(text: str, language: str):
75
  chunks.append(chunk_text)
76
  return chunks
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def generate_questions(summary: str, language: str) -> str:
79
- """Generate comprehension and critical thinking questions based on the summary."""
80
- truncated_summary = summary[:800]
81
 
82
  if language == "ar_AR":
83
  prompt = (
84
- f"اقرأ هذا الملخص: '{truncated_summary}'\n\n"
85
- "أنشئ 7 أسئلة مختلفة:\n"
86
- "- 3 أسئلة فهم (ماذا، من، أين)\n"
87
- "- 2 أسئلة تطبيقية (كيف يمكن استخدام هذا؟)\n"
88
- "- 2 أسئلة تحليلية (لماذا، ما العلاقة بين؟)\n"
89
- "اكتب الأسئلة فقط، كل سؤال في سطر جديد."
90
  )
91
  else:
92
  prompt = (
93
- f"Read this summary: '{truncated_summary}'\n\n"
94
- "Generate exactly 7 diverse questions:\n"
95
- "- 3 comprehension questions (What, Who, When, Where)\n"
96
- "- 2 application questions (How can this be used? What if?)\n"
97
- "- 2 analytical questions (Why, What's the relationship between?)\n"
98
- "Write only the questions, one per line, numbered 1-7."
99
  )
100
 
101
  try:
102
- generated = question_generator(
103
- prompt,
104
- max_length=400,
105
- num_return_sequences=1,
106
- do_sample=True,
107
- temperature=0.8,
108
- top_p=0.9
109
- )[0]["generated_text"]
 
 
110
 
111
  # Parse questions
112
  questions = []
113
- lines = generated.split('\n')
114
- for line in lines:
115
  line = line.strip()
116
- # Remove numbering if present
117
  line = re.sub(r'^\d+[\.\)]\s*', '', line)
118
  if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
119
  questions.append(line)
120
 
 
121
  if not questions or len(questions) < 3:
122
- # Fallback: generate basic questions
123
  if language == "ar_AR":
124
  questions = [
125
  "ما هي الفكرة الرئيسية في هذا النص؟",
126
- "من هم الأشخاص أو الجهات الرئيسية المذكورة؟",
127
- "كيف يمكن تطبيق هذه المعلومات في الحياة الواقعية؟",
128
- "ما هي النقاط الأكثر أهمية في الملخص؟",
129
- "لماذا هذا الموضوع مهم؟"
130
  ]
131
  else:
132
  questions = [
133
  "What is the main idea of this text?",
134
- "Who are the key people or entities mentioned?",
135
- "How can this information be applied in real life?",
136
- "What are the most important points in the summary?",
137
- "Why is this topic significant?",
138
- "What connections can you make to other knowledge?",
139
  "What questions remain unanswered?"
140
  ]
141
 
142
- # Format questions
143
  header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
144
  questions_md = header
145
  for i, q in enumerate(questions[:7], 1):
146
  questions_md += f"{i}. {q}\n"
147
 
148
- footer = "\n**Tip**: Answer these questions without looking at the text to test your understanding!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة على هذه الأسئلة دون النظر إلى النص لاختبار فهمك!"
149
  questions_md += footer
150
 
151
  return questions_md
152
  except Exception as e:
153
- return f"\n\n---\n\nError generating questions: {str(e)}\n"
 
154
 
155
  def extract_possible_headings(text: str) -> str:
156
- """Attempt to extract potential titles and subtitles from raw text."""
157
  lines = text.split('\n')
158
  headings = []
159
  for line in lines:
@@ -174,17 +201,17 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
174
  if not text or len(text.strip()) == 0:
175
  return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."
176
 
177
- # Length mapping (for T5 models, these are approximate)
178
  length_map = {
179
  "Short (25%)": {"max": 128, "min": 30},
180
- "Medium (50%)": {"max": 256, "min": 60},
181
- "Long (75%)": {"max": 400, "min": 100},
182
  "قصير (25%)": {"max": 128, "min": 30},
183
- "متوسط (50%)": {"max": 256, "min": 60},
184
- "طويل (75%)": {"max": 400, "min": 100}
185
  }
186
 
187
- length_params = length_map.get(summary_length, {"max": 256, "min": 60})
188
 
189
  progress(0, desc="Extracting headings...")
190
  headings_section = extract_possible_headings(text)
@@ -195,41 +222,16 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
195
  summaries = []
196
  progress(0.2, desc="Summarizing chunks...")
197
 
198
- for i in progress.tqdm(range(len(chunks))):
 
199
  chunk = chunks[i]
200
- try:
201
- if language == "ar_AR":
202
- # Use mT5 for Arabic with direct model inference
203
- inputs = ar_tokenizer(chunk, return_tensors="pt", max_length=512, truncation=True)
204
- summary_ids = ar_model.generate(
205
- inputs["input_ids"],
206
- max_length=length_params["max"],
207
- min_length=length_params["min"],
208
- length_penalty=2.0,
209
- num_beams=4,
210
- early_stopping=True
211
- )
212
- summary = ar_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
213
- else:
214
- # Use FLAN-T5 for English with summarization prompt
215
- prompt = f"Summarize the following text in detail:\n\n{chunk}"
216
- inputs = en_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
217
- summary_ids = en_model.generate(
218
- inputs["input_ids"],
219
- max_length=length_params["max"],
220
- min_length=length_params["min"],
221
- num_beams=4,
222
- early_stopping=True
223
- )
224
- summary = en_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
225
-
226
- cleaned = clean_text(summary)
227
- if cleaned: # Only add non-empty summaries
228
- chunk_label = f"**Chunk {i+1}:**" if language == "en_XX" else f"**الجزء {i+1}:**"
229
- summaries.append(f"{chunk_label} {cleaned}")
230
- except Exception as e:
231
- print(f"Error in chunk {i}: {str(e)}")
232
- continue # skip problematic chunks
233
 
234
  # Format summaries
235
  header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
@@ -247,7 +249,7 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
247
  return headings_section + summary_md + questions
248
 
249
  def read_pdf(file) -> str:
250
- """Safely extract text from PDF."""
251
  try:
252
  reader = PdfReader(file)
253
  pages = [page.extract_text() or "" for page in reader.pages]
@@ -298,7 +300,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
298
  "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
299
  "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
300
  "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
301
- "⚠️ **Note**: First run may take 2-3 minutes to load models. Be patient!"
302
  )
303
 
304
  with gr.Row():
@@ -349,13 +351,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
349
  gr.Markdown(
350
  "---\n"
351
  "### Tips for best results:\n"
352
- "• For Arabic text, select 'Arabic' language for better results\n"
353
  "• Longer texts work better (500+ words)\n"
354
- "• PDF quality affects extraction - clear text works best\n\n"
355
  "### نصائح لأفضل النتائج:\n"
356
  "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
357
  "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
358
- "• جودة PDF تؤثر على الاستخراج - النص الواضح يعمل بشكل أفضل"
359
  )
360
 
361
- demo.launch()
 
 
1
  import gradio as gr
2
  import re
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
  import torch
 
8
  # =========================
9
  # Model setup (CPU-safe, Multi-language)
10
  # =========================
 
 
 
 
 
11
print("Loading models... This may take a minute on first run.")

# Use T5 models - load directly without pipeline
EN_MODEL_NAME = "google/flan-t5-base"
AR_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
QA_MODEL_NAME = "google/flan-t5-small"

# English summarizer (instruction-tuned FLAN-T5).
print("Loading English model...")
en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME)
en_model = AutoModelForSeq2SeqLM.from_pretrained(EN_MODEL_NAME)

# Arabic / multilingual summarizer (mT5 fine-tuned on XLSum).
print("Loading Arabic model...")
ar_tokenizer = AutoTokenizer.from_pretrained(AR_MODEL_NAME)
ar_model = AutoModelForSeq2SeqLM.from_pretrained(AR_MODEL_NAME)

# Question generator (small FLAN-T5 keeps CPU latency manageable).
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME)

# Token budget per chunk when splitting long documents.
CHUNK_SIZE = 400

print("Models loaded successfully!")
34
 
35
  # =========================
36
  # Language Detection
 
39
  """Simple heuristic: detect if text contains Arabic characters."""
40
  arabic_pattern = re.compile(r'[\u0600-\u06FF]')
41
  if arabic_pattern.search(text):
42
+ return "ar_AR"
43
+ return "en_XX"
44
 
45
  # =========================
46
  # Utilities
 
52
  text = re.sub(r"[.]{2,}", ".", text)
53
  text = re.sub(r"[']{2,}", "'", text)
54
  text = re.sub(r"\s+", " ", text)
55
+ sentences = re.split(r'(?<=[.!?؟])\s+', text)
56
  seen = set()
57
  result = []
58
  for s in sentences:
 
64
 
65
  def chunk_text(text: str, language: str):
66
  """Token-aware chunking to avoid model overflow."""
 
67
  tokenizer = ar_tokenizer if language == "ar_AR" else en_tokenizer
68
 
69
  tokens = tokenizer.encode(text, add_special_tokens=False)
 
74
  chunks.append(chunk_text)
75
  return chunks
76
 
77
def generate_summary(text: str, language: str, max_length: int, min_length: int) -> str:
    """Summarize *text* with the model matching *language*.

    Args:
        text: Raw text to summarize (input truncated to 512 tokens).
        language: "ar_AR" routes to the multilingual XLSum model; any
            other value routes to FLAN-T5 with an instruction prompt.
        max_length: Maximum length (in tokens) of the generated summary.
        min_length: Minimum length (in tokens) of the generated summary.

    Returns:
        The cleaned summary string, or "" if generation fails.
    """
    try:
        if language == "ar_AR":
            # XLSum is a dedicated summarization model: feed the text directly.
            tokenizer, model = ar_tokenizer, ar_model
            inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
            extra_kwargs = {"length_penalty": 2.0}
        else:
            # FLAN-T5 is instruction-tuned: wrap the text in a summarization prompt.
            tokenizer, model = en_tokenizer, en_model
            prompt = f"Summarize this text in detail:\n\n{text}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            extra_kwargs = {}

        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                # BUGFIX: pass the attention mask so padded positions are
                # ignored; omitting it triggers warnings and can degrade
                # output quality.
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                min_length=min_length,
                num_beams=4,
                early_stopping=True,
                **extra_kwargs,
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return clean_text(summary)
    except Exception as e:
        # Best-effort: a failed chunk must not abort the whole document.
        print(f"Error generating summary: {str(e)}")
        return ""
111
+
112
def generate_questions(summary: str, language: str) -> str:
    """Generate study questions for *summary*, formatted as Markdown.

    Args:
        summary: Summary text to base the questions on; only the first
            600 characters are used to stay within the model's input budget.
        language: "ar_AR" for an Arabic prompt/fallback, anything else
            (expected "en_XX") for English.

    Returns:
        A Markdown section with up to 7 numbered questions and a tip
        footer, or a short error note if generation fails entirely.
    """
    truncated_summary = summary[:600]

    if language == "ar_AR":
        prompt = (
            f"اقرأ هذا النص: {truncated_summary}\n\n"
            "اكتب 5 أسئلة مهمة عن هذا النص. كل سؤال في سطر جديد."
        )
    else:
        prompt = (
            f"Read this text: {truncated_summary}\n\n"
            "Write 5 important questions about this text. One question per line."
        )

    try:
        inputs = qa_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():
            question_ids = qa_model.generate(
                inputs["input_ids"],
                # Pass the attention mask so padded positions are ignored.
                attention_mask=inputs["attention_mask"],
                max_length=300,
                num_beams=4,
                early_stopping=True,
                # BUGFIX: removed temperature=0.8 — temperature only
                # applies when do_sample=True and is ignored (with a
                # warning) under pure beam search.
            )
        generated = qa_tokenizer.decode(question_ids[0], skip_special_tokens=True)

        # Parse: strip leading numbering, keep lines that look like
        # questions (end in ?/؟) or are long enough to plausibly be one.
        questions = []
        for line in generated.split('\n'):
            line = line.strip()
            line = re.sub(r'^\d+[\.\)]\s*', '', line)
            if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
                questions.append(line)

        # Fallback: canned questions when the model produced too few usable lines.
        if not questions or len(questions) < 3:
            if language == "ar_AR":
                questions = [
                    "ما هي الفكرة الرئيسية في هذا النص؟",
                    "ما هي النقاط المهمة المذكورة؟",
                    "كيف يمكن تطبيق هذه المعلومات؟",
                    "لماذا هذا الموضوع مهم؟",
                    "ما هي الاستنتاجات الرئيسية؟"
                ]
            else:
                questions = [
                    "What is the main idea of this text?",
                    "What are the key points mentioned?",
                    "How can this information be applied?",
                    "Why is this topic important?",
                    "What are the main conclusions?",
                    "What connections can you make to other topics?",
                    "What questions remain unanswered?"
                ]

        # Format as a numbered Markdown list, capped at 7 questions.
        header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
        questions_md = header
        for i, q in enumerate(questions[:7], 1):
            questions_md += f"{i}. {q}\n"

        footer = "\n**Tip**: Try to answer these without looking at the text!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة دون النظر إلى النص!"
        questions_md += footer

        return questions_md
    except Exception as e:
        # Question generation is optional: never fail the summarization
        # pipeline over it.
        print(f"Error generating questions: {str(e)}")
        return "\n\n---\n\nUnable to generate questions.\n"
181
 
182
  def extract_possible_headings(text: str) -> str:
183
+ """Extract potential titles and subtitles from raw text."""
184
  lines = text.split('\n')
185
  headings = []
186
  for line in lines:
 
201
  if not text or len(text.strip()) == 0:
202
  return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."
203
 
204
+ # Length mapping
205
  length_map = {
206
  "Short (25%)": {"max": 128, "min": 30},
207
+ "Medium (50%)": {"max": 200, "min": 50},
208
+ "Long (75%)": {"max": 300, "min": 80},
209
  "قصير (25%)": {"max": 128, "min": 30},
210
+ "متوسط (50%)": {"max": 200, "min": 50},
211
+ "طويل (75%)": {"max": 300, "min": 80}
212
  }
213
 
214
+ length_params = length_map.get(summary_length, {"max": 200, "min": 50})
215
 
216
  progress(0, desc="Extracting headings...")
217
  headings_section = extract_possible_headings(text)
 
222
  summaries = []
223
  progress(0.2, desc="Summarizing chunks...")
224
 
225
+ total_chunks = len(chunks)
226
+ for i in range(total_chunks):
227
  chunk = chunks[i]
228
+ progress((0.2 + 0.6 * i / total_chunks), desc=f"Summarizing chunk {i+1}/{total_chunks}...")
229
+
230
+ summary = generate_summary(chunk, language, length_params["max"], length_params["min"])
231
+
232
+ if summary:
233
+ chunk_label = f"**Chunk {i+1}:**" if language == "en_XX" else f"**الجزء {i+1}:**"
234
+ summaries.append(f"{chunk_label} {summary}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
  # Format summaries
237
  header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
 
249
  return headings_section + summary_md + questions
250
 
251
  def read_pdf(file) -> str:
252
+ """Extract text from PDF."""
253
  try:
254
  reader = PdfReader(file)
255
  pages = [page.extract_text() or "" for page in reader.pages]
 
300
  "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
301
  "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
302
  "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
303
+ "⚠️ **Note**: First run may take 2-3 minutes to load models. Processing is slower on CPU."
304
  )
305
 
306
  with gr.Row():
 
351
  gr.Markdown(
352
  "---\n"
353
  "### Tips for best results:\n"
354
+ "• For Arabic text, select 'Arabic' for better results\n"
355
  "• Longer texts work better (500+ words)\n"
356
+ "• Processing may take 30-60 seconds on CPU\n\n"
357
  "### نصائح لأفضل النتائج:\n"
358
  "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
359
  "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
360
+ "• قد تستغرق المعالجة 30-60 ثانية على CPU"
361
  )
362
 
363
+ if __name__ == "__main__":
364
+ demo.launch()