Azidan commited on
Commit
e6b80d2
·
verified ·
1 Parent(s): bb331f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -98
app.py CHANGED
@@ -1,42 +1,59 @@
1
  import gradio as gr
2
  import re
3
- from transformers import pipeline, AutoTokenizer
4
  from PyPDF2 import PdfReader
5
  import tempfile
 
6
 
7
  # =========================
8
- # Model setup (CPU-safe)
9
  # =========================
10
- # Use smaller, faster models to speed up processing
11
- MODEL_NAME = "sshleifer/distilbart-cnn-6-6" # Smaller than 12-6, faster on CPU
12
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 
 
 
 
 
13
  summarizer = pipeline(
14
  "summarization",
15
- model=MODEL_NAME,
16
- tokenizer=tokenizer,
17
  device=-1 # CPU only
18
  )
19
 
20
- # Use smaller flan-t5-small for faster advice generation
21
- advice_generator = pipeline(
22
  "text2text-generation",
23
- model="google/flan-t5-small",
24
  device=-1 # CPU only
25
  )
26
 
27
- CHUNK_SIZE = 900 # safe margin under typical max input
 
 
 
 
 
 
 
 
 
 
28
 
29
  # =========================
30
  # Utilities
31
  # =========================
32
  def clean_text(text: str) -> str:
33
  """Fix quotes, spacing, repetition, broken punctuation."""
34
- text = text.replace("\u2018", "'").replace("\u2019", "'")
35
- text = text.replace("\u201C", '"').replace("\u201D", '"')
36
  text = re.sub(r"[.]{2,}", ".", text)
37
  text = re.sub(r"[']{2,}", "'", text)
38
  text = re.sub(r"\s+", " ", text)
39
- sentences = re.split(r'(?<=[.!?])\s+', text)
40
  seen = set()
41
  result = []
42
  for s in sentences:
@@ -46,7 +63,7 @@ def clean_text(text: str) -> str:
46
  result.append(s.strip())
47
  return " ".join(result)
48
 
49
- def chunk_text(text: str):
50
  """Token-aware chunking to avoid model overflow."""
51
  tokens = tokenizer.encode(text, add_special_tokens=False)
52
  chunks = []
@@ -56,114 +73,179 @@ def chunk_text(text: str):
56
  chunks.append(chunk_text)
57
  return chunks
58
 
59
- def generate_ai_advice(summary: str) -> str:
60
- """Generate personalized study advice based on the paper summary."""
61
- truncated_summary = summary[:1000]
62
-
63
- prompt = (
64
- f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
65
- "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
66
- "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
67
- "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
68
- "Output only the 5 tips as bullet points, nothing else."
69
- )
70
-
71
- generated = advice_generator(
72
- prompt,
73
- max_length=250,
74
- num_return_sequences=1,
75
- do_sample=False,
76
- temperature=0.7
77
- )[0]["generated_text"]
78
 
79
- # Try to clean into bullet points
80
- tips = [tip.strip() for tip in generated.split('\n') if tip.strip().startswith('-') or tip.strip()]
81
- if not tips or len(tips) < 3:
82
- tips = [t.strip() for t in generated.split('.') if t.strip()]
83
-
84
- advice_md = "\n\n---\n\n### 📚 AI-Generated Study Tips\n\n"
85
- for i, tip in enumerate(tips[:5], 1):
86
- clean_tip = tip.lstrip('- ').strip()
87
- advice_md += f"- {clean_tip}\n"
 
 
 
 
 
 
 
 
 
88
 
89
- advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
90
- return advice_md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def extract_possible_headings(text: str) -> str:
93
- """Attempt to extract potential titles and subtitles from raw text.
94
- This is a simple heuristic: short lines, all caps, or starting with numbers/sections."""
95
  lines = text.split('\n')
96
  headings = []
97
  for line in lines:
98
  stripped = line.strip()
99
- if stripped and (len(stripped) < 80) and (stripped.isupper() or re.match(r'^\d+\.?\s', stripped) or re.match(r'^[A-Z][a-z]+\s[A-Z]', stripped)):
 
 
 
 
 
100
  headings.append(stripped)
101
  if headings:
102
- return "### Extracted Possible Headings/Subtitles\n\n" + "\n- ".join([''] + headings) + "\n\n---\n\n"
103
  return ""
104
 
105
- def summarize_long_text(text: str, progress=gr.Progress()) -> str:
106
- """Summarize long text in chunks + add AI study advice.
107
- Now with longer summaries per chunk and formatted as bullet points."""
108
  if not text or len(text.strip()) == 0:
109
- return "No text provided."
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  progress(0, desc="Extracting headings...")
112
- # Extract possible headings first
113
  headings_section = extract_possible_headings(text)
114
 
115
  progress(0.1, desc="Chunking text...")
116
- chunks = chunk_text(text)
117
 
118
  summaries = []
119
  progress(0.2, desc="Summarizing chunks...")
 
 
 
 
 
120
  for i in progress.tqdm(range(len(chunks))):
121
  chunk = chunks[i]
122
  try:
 
 
 
123
  summary = summarizer(
124
  chunk,
125
- max_length=200, # Reduced slightly for speed (compromise between length and time)
126
- min_length=60, # Reduced for speed
127
- do_sample=False
 
128
  )[0]["summary_text"]
 
129
  cleaned = clean_text(summary)
130
- summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
131
- except Exception:
 
 
132
  pass # skip problematic chunks
133
 
134
- # Format summaries as bullet points
135
- summary_md = "### Detailed Summary (in Bullet Points)\n\n"
 
136
  for s in summaries:
137
  summary_md += f"- {s}\n"
138
 
139
- progress(0.8, desc="Generating AI advice...")
140
- ai_advice = generate_ai_advice(summary_md) # Use the bulleted summary for advice generation
141
 
142
  progress(1, desc="Done!")
143
- return headings_section + summary_md + ai_advice
144
 
145
  def read_pdf(file) -> str:
146
  """Safely extract text from PDF."""
147
  try:
148
  reader = PdfReader(file)
149
  pages = [page.extract_text() or "" for page in reader.pages]
150
- return "\n".join(pages) # Join with newlines to preserve line breaks for heading detection
151
  except Exception as e:
152
  return f"PDF read error: {str(e)}"
153
 
154
- # =========================
155
- # Download helper
156
- # =========================
157
  def create_download_file(content: str) -> str:
158
- """Create temporary file for Gradio file download component"""
159
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as tmp:
160
  tmp.write(content)
161
  return tmp.name
162
 
163
- # =========================
164
- # Main handler
165
- # =========================
166
- def process_input(text: str, file, progress=gr.Progress()):
167
  input_text = ""
168
 
169
  progress(0, desc="Reading input...")
@@ -174,7 +256,14 @@ def process_input(text: str, file, progress=gr.Progress()):
174
  else:
175
  return "Please paste some text or upload a PDF.", None
176
 
177
- result = summarize_long_text(input_text, progress)
 
 
 
 
 
 
 
178
  download_path = create_download_file(result)
179
 
180
  return result, download_path
@@ -182,46 +271,75 @@ def process_input(text: str, file, progress=gr.Progress()):
182
  # =========================
183
  # Gradio UI
184
  # =========================
185
- with gr.Blocks() as demo:
186
- gr.Markdown("# 📄 Long Text Summarizer + AI Study Assistant")
 
 
187
  gr.Markdown(
188
- " Handles very long documents (thousands of words)\n"
189
- "• Supports **PDF** upload or direct paste\n"
190
- "• Runs on CPU works on free hardware\n"
191
- "• Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
192
- "• Includes **5 AI-generated study tips** tailored to the content\n"
193
- "• Download result as .txt file\n"
194
- "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
195
  )
196
 
197
  with gr.Row():
198
- text_input = gr.Textbox(
199
- lines=10,
200
- label="Paste your text here (optional)",
201
- placeholder="Paste lecture notes, article, book chapter...",
202
- )
203
- file_input = gr.File(
204
- label="Or upload a PDF",
205
- file_types=[".pdf"]
206
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
209
 
210
  output = gr.Textbox(
211
- lines=16,
212
- label="Summary + AI-generated study advice",
213
  interactive=False
214
  )
215
 
216
  download_output = gr.File(
217
- label="Download full result (.txt)",
218
  interactive=False
219
  )
220
 
221
  summarize_btn.click(
222
  fn=process_input,
223
- inputs=[text_input, file_input],
224
  outputs=[output, download_output]
225
  )
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  demo.launch()
 
1
  import gradio as gr
2
  import re
3
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
+ import torch
7
 
8
# =========================
# Model setup (CPU-safe, Multi-language)
# =========================
# One multilingual summarizer serves both UI languages (mBART-50 covers
# English and Arabic); flan-t5-base handles the question-generation prompts.
SUMMARIZER_MODEL = "facebook/mbart-large-50-many-to-many-mmt"
QA_MODEL = "google/flan-t5-base"  # Better for question generation

print("Loading models... This may take a minute on first run.")

# Summarizer with mBART (supports Arabic). Tokenizer and model are loaded
# explicitly so the tokenizer can be reused for chunking and src_lang control.
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL)
summarizer = pipeline(
    "summarization",
    model=summarizer_model,
    tokenizer=summarizer_tokenizer,
    device=-1,  # CPU only
)

# Question generator (text-to-text, prompted in English or Arabic).
question_generator = pipeline(
    "text2text-generation",
    model=QA_MODEL,
    device=-1,  # CPU only
)

# Conservative chunk size (tokens) to stay well under mBART's input limit.
CHUNK_SIZE = 512
35
+
36
+ # =========================
37
+ # Language Detection
38
+ # =========================
39
def detect_language(text: str) -> str:
    """Return an mBART language code via a simple script heuristic.

    Any character in the Arabic Unicode block (U+0600-U+06FF) marks the
    text as Arabic; otherwise English is assumed.
    """
    if re.search(r'[\u0600-\u06FF]', text):
        return "ar_AR"  # Arabic
    return "en_XX"  # English
45
 
46
  # =========================
47
  # Utilities
48
  # =========================
49
  def clean_text(text: str) -> str:
50
  """Fix quotes, spacing, repetition, broken punctuation."""
51
+ text = text.replace("\u2018", "'").replace("\u2019", "'")
52
+ text = text.replace("\u201C", '"').replace("\u201D", '"')
53
  text = re.sub(r"[.]{2,}", ".", text)
54
  text = re.sub(r"[']{2,}", "'", text)
55
  text = re.sub(r"\s+", " ", text)
56
+ sentences = re.split(r'(?<=[.!?؟])\s+', text) # Added Arabic question mark
57
  seen = set()
58
  result = []
59
  for s in sentences:
 
63
  result.append(s.strip())
64
  return " ".join(result)
65
 
66
+ def chunk_text(text: str, tokenizer):
67
  """Token-aware chunking to avoid model overflow."""
68
  tokens = tokenizer.encode(text, add_special_tokens=False)
69
  chunks = []
 
73
  chunks.append(chunk_text)
74
  return chunks
75
 
76
def generate_questions(summary: str, language: str) -> str:
    """Generate comprehension and critical-thinking questions from a summary.

    Args:
        summary: Markdown summary text; only the first 800 characters are
            sent to the generator to keep the CPU prompt short.
        language: mBART-style code, "ar_AR" or "en_XX"; selects the prompt,
            section headers, and fallback questions.

    Returns:
        A Markdown section with up to 7 numbered study questions, or an
        error note if generation raised.
    """
    truncated_summary = summary[:800]

    if language == "ar_AR":
        prompt = (
            f"اقرأ هذا الملخص: '{truncated_summary}'\n\n"
            "أنشئ 7 أسئلة مختلفة:\n"
            "- 3 أسئلة فهم (ماذا، من، أين)\n"
            "- 2 أسئلة تطبيقية (كيف يمكن استخدام هذا؟)\n"
            "- 2 أسئلة تحليلية (لماذا، ما العلاقة بين؟)\n"
            "اكتب الأسئلة فقط، كل سؤال في سطر جديد."
        )
    else:
        prompt = (
            f"Read this summary: '{truncated_summary}'\n\n"
            "Generate exactly 7 diverse questions:\n"
            "- 3 comprehension questions (What, Who, When, Where)\n"
            "- 2 application questions (How can this be used? What if?)\n"
            "- 2 analytical questions (Why, What's the relationship between?)\n"
            "Write only the questions, one per line, numbered 1-7."
        )

    try:
        generated = question_generator(
            prompt,
            max_length=400,
            num_return_sequences=1,
            do_sample=True,  # sampling (with temperature/top_p) varies questions per run
            temperature=0.8,
            top_p=0.9
        )[0]["generated_text"]

        # Parse model output: drop leading "1." / "2)" numbering and keep
        # lines ending in a question mark (Latin or Arabic) or long enough
        # to plausibly be a question.
        questions = []
        for line in generated.split('\n'):
            line = re.sub(r'^\d+[\.\)]\s*', '', line.strip())
            if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
                questions.append(line)

        if len(questions) < 3:
            # Fallback: generic questions. FIX: both languages now provide 7
            # entries — the Arabic list previously held only 5, inconsistent
            # with the "exactly 7" prompt contract and the English fallback.
            if language == "ar_AR":
                questions = [
                    "ما هي الفكرة الرئيسية في هذا النص؟",
                    "من هم الأشخاص أو الجهات الرئيسية المذكورة؟",
                    "كيف يمكن تطبيق هذه المعلومات في الحياة الواقعية؟",
                    "ما هي النقاط الأكثر أهمية في الملخص؟",
                    "لماذا هذا الموضوع مهم؟",
                    "ما العلاقة بين الأفكار الرئيسية المذكورة؟",
                    "ما هي الأسئلة التي ما زالت بحاجة إلى إجابة؟"
                ]
            else:
                questions = [
                    "What is the main idea of this text?",
                    "Who are the key people or entities mentioned?",
                    "How can this information be applied in real life?",
                    "What are the most important points in the summary?",
                    "Why is this topic significant?",
                    "What connections can you make to other knowledge?",
                    "What questions remain unanswered?"
                ]

        # Format as a numbered Markdown list capped at 7 questions.
        header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
        questions_md = header
        for i, q in enumerate(questions[:7], 1):
            questions_md += f"{i}. {q}\n"

        footer = "\n**Tip**: Answer these questions without looking at the text to test your understanding!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة على هذه الأسئلة دون النظر إلى النص لاختبار فهمك!"
        questions_md += footer

        return questions_md
    except Exception as e:
        # Best-effort feature: surface the failure in the document rather
        # than crashing the whole summarization request.
        return f"\n\n---\n\nError generating questions: {str(e)}\n"
152
 
153
def extract_possible_headings(text: str) -> str:
    """Attempt to extract potential titles and subtitles from raw text.

    Heuristics: a heading is a short line (< 80 chars) that is ALL CAPS,
    starts with a section number ("1." / "2 "), looks like Title Case,
    or begins with a chapter/section keyword (English or Arabic).

    Returns a Markdown section listing up to 10 candidates, or "" if none.
    """
    lines = text.split('\n')
    headings = []
    for line in lines:
        stripped = line.strip()
        if stripped and (len(stripped) < 80) and (
            stripped.isupper() or
            re.match(r'^\d+\.?\s', stripped) or
            re.match(r'^[A-Z][a-z]+\s[A-Z]', stripped) or
            # BUG FIX: the original pattern r'^[الفصل|Chapter|Section]' was a
            # character CLASS, matching ANY single one of those characters
            # (so e.g. any line starting with 'c', 'h', or 'S' qualified).
            # Use alternation with a word boundary instead.
            re.match(r'^(الفصل|Chapter|Section)\b', stripped, re.IGNORECASE)
        ):
            headings.append(stripped)
    if headings:
        return "### 📋 Extracted Headings\n\n" + "\n- ".join([''] + headings[:10]) + "\n\n---\n\n"
    return ""
169
 
170
def summarize_long_text(text: str, summary_length: str, language: str, progress=gr.Progress()) -> str:
    """Summarize long text chunk-by-chunk, then append study questions.

    Args:
        text: Raw input text (pasted or extracted from a PDF).
        summary_length: UI label ("Short (25%)", "Medium (50%)", ... in
            English or Arabic); mapped to per-chunk generation limits.
        language: mBART code "en_XX" or "ar_AR"; controls generation
            language and the Markdown labels.
        progress: Gradio progress tracker (Gradio injects a fresh tracker
            per request, so the default instance is not shared state).

    Returns:
        Markdown: extracted headings + bulleted chunk summaries + questions.
    """
    if not text or len(text.strip()) == 0:
        return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."

    # Map the UI length choice (English or Arabic label) to generation limits;
    # unknown labels fall back to the medium setting.
    length_map = {
        "Short (25%)": {"max": 150, "min": 40},
        "Medium (50%)": {"max": 250, "min": 80},
        "Long (75%)": {"max": 400, "min": 120},
        "قصير (25%)": {"max": 150, "min": 40},
        "متوسط (50%)": {"max": 250, "min": 80},
        "طويل (75%)": {"max": 400, "min": 120}
    }
    length_params = length_map.get(summary_length, {"max": 250, "min": 80})

    progress(0, desc="Extracting headings...")
    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text, summarizer_tokenizer)

    summaries = []
    progress(0.2, desc="Summarizing chunks...")

    # Keep source and target languages identical so mBART's forced BOS token
    # makes the output stay in the input language.
    # NOTE(review): mbart-large-50-many-to-many-mmt is a translation
    # fine-tune, not a summarization one — verify output quality for this task.
    src_lang = language
    tgt_lang = language

    # FIX: hoist the loop-invariant chunk-label word out of the loop
    # (previously the language check ran on every iteration).
    label = "Chunk" if language == "en_XX" else "الجزء"

    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            # mBART needs the source language set on the tokenizer.
            summarizer_tokenizer.src_lang = src_lang

            summary = summarizer(
                chunk,
                max_length=length_params["max"],
                min_length=length_params["min"],
                do_sample=False,
                forced_bos_token_id=summarizer_tokenizer.lang_code_to_id[tgt_lang]
            )[0]["summary_text"]

            cleaned = clean_text(summary)
            summaries.append(f"**{label} {i+1}:** {cleaned}")
        except Exception as e:
            # Skip problematic chunks but leave a trace in the logs.
            # (FIX: removed the dead `pass` that followed this print.)
            print(f"Error in chunk {i}: {str(e)}")

    # Format summaries as a bulleted Markdown list.
    header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
    summary_md = header
    for s in summaries:
        summary_md += f"- {s}\n"

    progress(0.8, desc="Generating questions...")
    questions = generate_questions(summary_md, language)

    progress(1, desc="Done!")
    return headings_section + summary_md + questions
232
 
233
def read_pdf(file) -> str:
    """Safely extract text from PDF.

    Pages with no extractable text contribute an empty string; pages are
    joined with newlines so heading detection can see line boundaries.
    On failure the error is returned as text rather than raised.
    """
    try:
        extracted = []
        for page in PdfReader(file).pages:
            extracted.append(page.extract_text() or "")
        return "\n".join(extracted)
    except Exception as e:
        return f"PDF read error: {str(e)}"
241
 
 
 
 
242
def create_download_file(content: str) -> str:
    """Write *content* to a persistent temp .txt file and return its path.

    The file is created with delete=False so Gradio's File component can
    serve it after this function returns.
    """
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    )
    with tmp:
        tmp.write(content)
    return tmp.name
247
 
248
+ def process_input(text: str, file, summary_length: str, language: str, progress=gr.Progress()):
 
 
 
249
  input_text = ""
250
 
251
  progress(0, desc="Reading input...")
 
256
  else:
257
  return "Please paste some text or upload a PDF.", None
258
 
259
+ # Auto-detect language if not specified
260
+ if language == "Auto-detect":
261
+ detected_lang = detect_language(input_text)
262
+ language = detected_lang
263
+ else:
264
+ language = "ar_AR" if "Arabic" in language or "عربي" in language else "en_XX"
265
+
266
+ result = summarize_long_text(input_text, summary_length, language, progress)
267
  download_path = create_download_file(result)
268
 
269
  return result, download_path
 
271
  # =========================
272
  # Gradio UI
273
  # =========================
274
# Build the two-column Gradio interface and wire the button to process_input.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Bilingual page titles (English, then Arabic).
    gr.Markdown("# 📄 Multilingual Text Summarizer + Study Assistant")
    gr.Markdown("# ملخص النصوص متعدد اللغات + مساعد الدراسة")

    # Feature overview shown above the inputs.
    gr.Markdown(
        "### Features / المميزات:\n"
        "• **English & Arabic support** / دعم اللغة العربية والإنجليزية\n"
        "• **PDF upload** / رفع ملفات PDF\n"
        "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
        "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
        "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
        "⚠️ **Note**: First run may take 2-3 minutes to load models. Be patient!"
    )

    with gr.Row():
        # Left column: the text / PDF inputs.
        with gr.Column():
            text_input = gr.Textbox(
                label="📝 Paste your text / الصق نصك هنا",
                placeholder="Paste lecture notes, article, research paper...\nالصق ملاحظات المحاضرة، مقال، ورقة بحثية...",
                lines=10,
            )
            file_input = gr.File(
                label="📎 Or upload PDF / أو ارفع ملف PDF",
                file_types=[".pdf"],
            )

        # Right column: language and summary-length options.
        with gr.Column():
            language_choice = gr.Radio(
                label="🌐 Language / اللغة",
                choices=["Auto-detect", "English", "Arabic / عربي"],
                value="Auto-detect",
            )
            length_choice = gr.Radio(
                label="📏 Summary Length / طول الملخص",
                choices=["Short (25%)", "Medium (50%)", "Long (75%)"],
                value="Medium (50%)",
                info="Short = concise, Long = detailed",
            )

    summarize_btn = gr.Button(
        "Summarize & Generate Questions", variant="primary", size="lg"
    )

    # Results: read-only text area plus a downloadable .txt copy.
    output = gr.Textbox(
        label="📋 Summary + Study Questions / الملخص + الأسئلة الدراسية",
        lines=20,
        interactive=False,
    )
    download_output = gr.File(
        label="💾 Download Result (.txt) / تحميل النتيجة",
        interactive=False,
    )

    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input, length_choice, language_choice],
        outputs=[output, download_output],
    )

    # Bilingual usage tips shown below the interface.
    gr.Markdown(
        "---\n"
        "### Tips for best results:\n"
        "• For Arabic text, select 'Arabic' language for better results\n"
        "• Longer texts work better (500+ words)\n"
        "• PDF quality affects extraction - clear text works best\n\n"
        "### نصائح لأفضل النتائج:\n"
        "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
        "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
        "• جودة PDF تؤثر على الاستخراج - النص الواضح يعمل بشكل أفضل"
    )

demo.launch()