Spaces:

NOBODY204
/

ArchivChat

Sleeping

App Files Files Community

NOBODY204 committed on Mar 2

Commit

8f60d7b

verified ·

1 Parent(s): 85bd81a

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -39

app.py CHANGED Viewed

@@ -5,15 +5,16 @@ import re
 import easyocr
 import pdf2image
 import numpy as np
-import os
-from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
 # -----------------------------
-# 1️⃣ OCR en arabe
 # -----------------------------
-reader = easyocr.Reader(['ar'])  # OCR pour l'arabe
-def ocr_pdf_arabic(file_path):
     pages = pdf2image.convert_from_path(file_path, dpi=300)
     text = ""
     for page in pages:
@@ -22,51 +23,40 @@ def ocr_pdf_arabic(file_path):
     return text
 # -----------------------------
-# 2️⃣ Logique de rangement (11 boîtes par étage)
 # -----------------------------
 def calculer_emplacement(n):
     try:
         if n is None or n < 1:
             return "### ⚠️ En attente d'un numéro de boîte..."
         n = int(n)
         etage = ((n - 1) // 11) + 1
         rayon = "A" if n <= 11 else "B"
         case = ((n - 1) % 11) + 1
         return f"""
-        # 📍 EMPLACEMENT TROUVÉ
-        ---
-        ## 📦 BOÎTE N° : {n}
-        ## 🏢 RAYON : **{rayon}**
-        ## 📏 ÉTAGE : **{etage}**
-        ## 🔢 CASE (Position) : **{case} / 11**
-        ---
-        *Instructions : Allez au Rayon {rayon}, montez à l'étage {etage} et prenez la {case}ème boîte.*
-        """
     except:
         return "❌ Erreur de saisie."
 # -----------------------------
-# 3️⃣ Modèle résumé fidèle arabe
 # -----------------------------
-print("Chargement du modèle mBART multilingue pour résumé arabe...")
-model_name = "facebook/mbart-large-50-many-to-many-mmt"
-tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
-model = MBartForConditionalGeneration.from_pretrained(model_name)
-def summarize_arabic(text, longueur):
-    # Tokenizer et génération
-    tokenizer.src_lang = "ar_AR"
-    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
-    max_len = 180 if longueur == "Détaillé" else 80
-    min_len = 50 if longueur == "Détaillé" else 20
-    summary_ids = model.generate(inputs["input_ids"], max_length=max_len, min_length=min_len, do_sample=False)
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    return summary
 # -----------------------------
-# 4️⃣ Classe d'analyse PDF
 # -----------------------------
 class ArchivAppV15:
     def analyser_pdf(self, file, longueur):
@@ -80,15 +70,16 @@ class ArchivAppV15:
                 if content:
                     text += content + " "
-            # Si texte trop court → OCR arabe
             if len(text.strip()) < 50:
-                text = ocr_pdf_arabic(file.name)
             clean_text = re.sub(r"\s+", " ", text).strip()
             if len(clean_text) < 50:
                 return "❌ Document trop court après OCR", ""
-            summary = summarize_arabic(clean_text, longueur)
             return "✅ Synthèse réussie", summary
         except Exception as e:
@@ -100,7 +91,7 @@ app = ArchivAppV15()
 # 5️⃣ Interface Gradio
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📁 ArchivChat V15 - Gestion & Synthèse Arabe")
     with gr.Tab("📍 Localisation"):
         gr.Markdown("### Calcul automatique : 11 boîtes par étage")
@@ -109,9 +100,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         output_loc = gr.Markdown("### L'emplacement détaillé s'affichera ici")
         input_num.change(calculer_emplacement, inputs=input_num, outputs=output_loc)
-    with gr.Tab("📄 Résumé Arabe"):
         with gr.Row():
-            file_in = gr.File(label="Déposer le PDF d'archive (arabe)", file_types=[".pdf"])
             longueur_opt = gr.Radio(["Court", "Détaillé"], label="Style de résumé", value="Court")
         btn_res = gr.Button("Lancer l'analyse ✨", variant="primary")
         with gr.Row():

 import easyocr
 import pdf2image
 import numpy as np
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lex_rank import LexRankSummarizer
 # -----------------------------
+# 1️⃣ OCR arabe + français
 # -----------------------------
+reader = easyocr.Reader(['ar', 'fr'])
+def ocr_pdf_multilang(file_path):
     pages = pdf2image.convert_from_path(file_path, dpi=300)
     text = ""
     for page in pages:
     return text
 # -----------------------------
+# 2️⃣ Calcul emplacement (11 boîtes / étage)
 # -----------------------------
 def calculer_emplacement(n):
     """Return a Markdown card giving the storage location of box ``n``.

     Args:
         n: Box number. ``None`` or any value below 1 is treated as
             "nothing entered yet". Other numeric values are truncated
             with ``int()``.

     Returns:
         A Markdown string: the location card (rayon, étage, case) for a
         valid number, a "waiting" placeholder when no usable number was
         given, or a short error message when ``n`` cannot be compared
         or converted to an integer.
     """
     # Single source of truth for the shelf capacity used below.
     boites_par_etage = 11

     # Only the comparison and the conversion can fail, so only they are
     # guarded; the previous bare ``except:`` also trapped
     # KeyboardInterrupt and SystemExit.
     try:
         if n is None or n < 1:
             return "### ⚠️ En attente d'un numéro de boîte..."
         n = int(n)
     except (TypeError, ValueError, OverflowError):
         return "❌ Erreur de saisie."

     # Zero-based floor index and slot index, then shifted to 1-based.
     etage_index, case_index = divmod(n - 1, boites_par_etage)
     etage = etage_index + 1
     case = case_index + 1

     # NOTE(review): as in the original, only the first 11 boxes map to
     # rayon "A" and every later box maps to "B" -- confirm this rule.
     rayon = "A" if n <= boites_par_etage else "B"

     # Built line by line; the empty first and last items reproduce the
     # leading and trailing newline of the original triple-quoted string.
     lignes = (
         "",
         "# 📍 EMPLACEMENT TROUVÉ",
         "---",
         f"## 📦 BOÎTE N° : {n}",
         f"## 🏢 RAYON : **{rayon}**",
         f"## 📏 ÉTAGE : **{etage}**",
         f"## 🔢 CASE (Position) : **{case} / {boites_par_etage}**",
         "---",
         f"*Instructions : Allez au Rayon {rayon}, montez à l'étage {etage} et prenez la {case}ème boîte.*",
         "",
     )
     return "\n".join(lignes)
 # -----------------------------
+# 3️⃣ Résumé extractif fiable hors-ligne
 # -----------------------------
+def summarize_offline(text, n_sentences=5):
+    parser = PlaintextParser.from_string(text, Tokenizer("arabic"))
+    summarizer = LexRankSummarizer()
+    summary = summarizer(parser.document, sentences_count=n_sentences)
+    return " ".join([str(sentence) for sentence in summary])
 # -----------------------------
+# 4️⃣ Classe analyse PDF
 # -----------------------------
 class ArchivAppV15:
     def analyser_pdf(self, file, longueur):
                 if content:
                     text += content + " "
+            # Si texte trop court → OCR
             if len(text.strip()) < 50:
+                text = ocr_pdf_multilang(file.name)
             clean_text = re.sub(r"\s+", " ", text).strip()
             if len(clean_text) < 50:
                 return "❌ Document trop court après OCR", ""
+            n_sentences = 10 if longueur == "Détaillé" else 5
+            summary = summarize_offline(clean_text, n_sentences=n_sentences)
             return "✅ Synthèse réussie", summary
         except Exception as e:
 # 5️⃣ Interface Gradio
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📁 ArchivChat V15 - Gestion & Synthèse Arabe + Français")
     with gr.Tab("📍 Localisation"):
         gr.Markdown("### Calcul automatique : 11 boîtes par étage")
         output_loc = gr.Markdown("### L'emplacement détaillé s'affichera ici")
         input_num.change(calculer_emplacement, inputs=input_num, outputs=output_loc)
+    with gr.Tab("📄 Résumé Multilingue"):
         with gr.Row():
+            file_in = gr.File(label="Déposer le PDF d'archive (arabe ou français)", file_types=[".pdf"])
             longueur_opt = gr.Radio(["Court", "Détaillé"], label="Style de résumé", value="Court")
         btn_res = gr.Button("Lancer l'analyse ✨", variant="primary")
         with gr.Row():