Update app.py
app.py
CHANGED
@@ -8,19 +8,16 @@ import torch
 # =========================
 # Model setup (CPU-safe, Multi-language)
 # =========================
-# Use
-EN_SUMMARIZER_MODEL = "
+# Use T5-based models that support text2text-generation
+EN_SUMMARIZER_MODEL = "google/flan-t5-base"  # English - works with text2text
 AR_SUMMARIZER_MODEL = "csebuetnlp/mT5_multilingual_XLSum"  # Multilingual (includes Arabic)
 QA_MODEL = "google/flan-t5-small"  # Question generation
 
 print("Loading models... This may take a minute on first run.")
 
-# English summarizer
-
-
-    model=EN_SUMMARIZER_MODEL,
-    device=-1  # CPU only
-)
+# English summarizer using text2text-generation
+en_tokenizer = AutoTokenizer.from_pretrained(EN_SUMMARIZER_MODEL)
+en_model = AutoModelForSeq2SeqLM.from_pretrained(EN_SUMMARIZER_MODEL)
 
 # Multilingual summarizer (for Arabic and other languages)
 ar_tokenizer = AutoTokenizer.from_pretrained(AR_SUMMARIZER_MODEL)
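As a quick sanity check, the new English path can be exercised on its own before wiring it into the app. This is a minimal sketch, assuming transformers and torch are installed and that app.py already imports AutoTokenizer and AutoModelForSeq2SeqLM from transformers; the variable names and sample text are placeholders, and the call pattern mirrors the one summarize_long_text uses further down in this diff.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Prompt-style summarization, same as the else-branch added later in this diff
text = "FLAN-T5 is an instruction-tuned T5 model that handles text2text-generation tasks."
inputs = tok("Summarize the following text in detail:\n\n" + text,
             return_tensors="pt", max_length=512, truncation=True)
ids = model.generate(inputs["input_ids"], max_length=64, min_length=5,
                     num_beams=4, early_stopping=True)
print(tok.decode(ids[0], skip_special_tokens=True))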
@@ -33,7 +30,7 @@ question_generator = pipeline(
     device=-1  # CPU only
 )
 
-CHUNK_SIZE =
+CHUNK_SIZE = 400  # Conservative chunk size for T5 models
 
 # =========================
 # Language Detection
@@ -68,10 +65,7 @@ def clean_text(text: str) -> str:
 def chunk_text(text: str, language: str):
     """Token-aware chunking to avoid model overflow."""
     # Use appropriate tokenizer based on language
-    if language == "ar_AR"
-        tokenizer = ar_tokenizer
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(EN_SUMMARIZER_MODEL)
+    tokenizer = ar_tokenizer if language == "ar_AR" else en_tokenizer
 
     tokens = tokenizer.encode(text, add_special_tokens=False)
     chunks = []
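The rest of chunk_text is unchanged and therefore not shown in this hunk. For context, token-aware chunking against a fixed window such as CHUNK_SIZE = 400 typically looks like the following sketch; this is an illustration of the technique, not the app's exact implementation, and chunk_by_tokens is a hypothetical name.

def chunk_by_tokens(text: str, tokenizer, chunk_size: int = 400):
    # Encode once, slice the token ids into fixed-size windows,
    # then decode each window back to text for the summarizer.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
    return chunks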
@@ -180,17 +174,17 @@ summarize_long_text(text: str, summary_length: str, language: str, progress=
     if not text or len(text.strip()) == 0:
         return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."
 
-    # Length mapping
+    # Length mapping (for T5 models, these are approximate)
     length_map = {
-        "Short (25%)": {"max":
-        "Medium (50%)": {"max":
-        "Long (75%)": {"max": 400, "min":
-        "قصير (25%)": {"max":
-        "متوسط (50%)": {"max":
-        "طويل (75%)": {"max": 400, "min":
+        "Short (25%)": {"max": 128, "min": 30},
+        "Medium (50%)": {"max": 256, "min": 60},
+        "Long (75%)": {"max": 400, "min": 100},
+        "قصير (25%)": {"max": 128, "min": 30},
+        "متوسط (50%)": {"max": 256, "min": 60},
+        "طويل (75%)": {"max": 400, "min": 100}
     }
 
-    length_params = length_map.get(summary_length, {"max":
+    length_params = length_map.get(summary_length, {"max": 256, "min": 60})
 
     progress(0, desc="Extracting headings...")
     headings_section = extract_possible_headings(text)
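For example, the lookup resolves a UI label to generation bounds and falls back to the medium setting for any unrecognized label:

length_map.get("Long (75%)", {"max": 256, "min": 60})  # -> {"max": 400, "min": 100}
length_map.get("Unknown", {"max": 256, "min": 60})      # -> falls back to {"max": 256, "min": 60}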
@@ -205,7 +199,7 @@ summarize_long_text(text: str, summary_length: str, language: str, progress=
         chunk = chunks[i]
         try:
             if language == "ar_AR":
-                # Use mT5 for Arabic
+                # Use mT5 for Arabic with direct model inference
                 inputs = ar_tokenizer(chunk, return_tensors="pt", max_length=512, truncation=True)
                 summary_ids = ar_model.generate(
                     inputs["input_ids"],
@@ -217,26 +211,34 @@ summarize_long_text(text: str, summary_length: str, language: str, progress=
                 )
                 summary = ar_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
             else:
-                # Use
-
-
+                # Use FLAN-T5 for English with summarization prompt
+                prompt = f"Summarize the following text in detail:\n\n{chunk}"
+                inputs = en_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
+                summary_ids = en_model.generate(
+                    inputs["input_ids"],
                     max_length=length_params["max"],
                     min_length=length_params["min"],
-
-
+                    num_beams=4,
+                    early_stopping=True
+                )
+                summary = en_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
             cleaned = clean_text(summary)
-
-
+            if cleaned:  # Only add non-empty summaries
+                chunk_label = f"**Chunk {i+1}:**" if language == "en_XX" else f"**الجزء {i+1}:**"
+                summaries.append(f"{chunk_label} {cleaned}")
         except Exception as e:
             print(f"Error in chunk {i}: {str(e)}")
-
+            continue  # skip problematic chunks
 
     # Format summaries
     header = "### ๐ Detailed Summary\n\n" if language == "en_XX" else "### ๐ ملخص تفصيلي\n\n"
     summary_md = header
-
-
+    if summaries:
+        for s in summaries:
+            summary_md += f"- {s}\n"
+    else:
+        summary_md += "Unable to generate summary. Please try with different text.\n"
 
     progress(0.8, desc="Generating questions...")
     questions = generate_questions(summary_md, language)