Spaces:

SamiKLN
/

DocImageAI-Explorer

Sleeping

App Files Community

SamiKLN committed on Apr 27, 2025

Commit

d59b7f5

verified ·

1 Parent(s): db0f953

Update main.py

Browse files

Files changed (1) hide show

main.py +38 -73

main.py CHANGED Viewed

@@ -43,7 +43,7 @@ client = InferenceClient(token=HF_TOKEN)
 MODELS = {
     "summary": "facebook/bart-large-cnn",
     "caption": "Salesforce/blip-image-captioning-large",
-    "qa": "google/flan-t5-base"  # Remplacé par un modèle plus petit compatible
 }
 # Modèles Pydantic
@@ -107,13 +107,6 @@ def extract_text_from_excel(file_path: str) -> str:
         logger.error(f"Excel extraction error: {e}")
         raise HTTPException(400, "Erreur d'extraction Excel")
-# Fonction helper pour trouver un fichier par ID sans récursion
-def find_file_by_id(file_id: str):
-    for file_path in UPLOAD_FOLDER.iterdir():
-        if file_path.name.startswith(file_id):
-            return file_path
-    return None
 async def process_uploaded_file(file: UploadFile) -> FileInfo:
     file_ext = Path(file.filename).suffix.lower()
     file_id = str(uuid.uuid4())
@@ -169,40 +162,31 @@ async def upload_files(files: List[UploadFile] = File(...)):
 @app.post("/api/summarize")
 async def summarize_document(request: SummaryRequest):
     try:
-        file_path = find_file_by_id(request.file_id)
-        if not file_path:
-            raise HTTPException(404, "Fichier non trouvé")
         text = ""
         if file_path.suffix == ".pdf":
             text = extract_text_from_pdf(str(file_path))
-        elif file_path.suffix == ".docx":
-            text = extract_text_from_docx(str(file_path))
-        elif file_path.suffix == ".pptx":
-            text = extract_text_from_pptx(str(file_path))
-        elif file_path.suffix in (".xlsx", ".xls"):
-            text = extract_text_from_excel(str(file_path))
         else:
-            try:
-                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-                    text = f.read()
-            except Exception as e:
-                logger.error(f"Error reading file: {e}")
-                raise HTTPException(400, "Erreur de lecture du fichier")
-        # Limiter la taille pour éviter des erreurs avec l'API
-        text_input = text[:4000] + ("..." if len(text) > 4000 else "")
         summary = client.summarization(
-            text=text_input,
             model=MODELS["summary"],
             parameters={"max_length": request.max_length}
         )
         return {"summary": summary}
-    except HTTPException as e:
-        raise e
     except Exception as e:
         logger.error(f"Summarization error: {e}")
         raise HTTPException(500, f"Erreur de résumé: {str(e)}")
@@ -210,12 +194,7 @@ async def summarize_document(request: SummaryRequest):
 @app.post("/api/caption")
 async def caption_image(request: CaptionRequest):
     try:
-        file_path = find_file_by_id(request.file_id)
-        if not file_path:
-            raise HTTPException(404, "Fichier non trouvé")
-        if not file_path.suffix.lower() in [".jpg", ".jpeg", ".png", ".gif"]:
-            raise HTTPException(400, "Le fichier doit être une image (jpg, png, gif)")
         with open(file_path, "rb") as image_file:
             image_data = image_file.read()
@@ -226,8 +205,6 @@ async def caption_image(request: CaptionRequest):
         )
         return {"caption": caption}
-    except HTTPException as e:
-        raise e
     except Exception as e:
         logger.error(f"Captioning error: {e}")
         raise HTTPException(500, f"Erreur de description: {str(e)}")
@@ -237,42 +214,34 @@ async def answer_question(request: QARequest):
     try:
         context = ""
         if request.file_id:
-            file_path = find_file_by_id(request.file_id)
-            if file_path:
-                if file_path.suffix.lower() in (".jpg", ".jpeg", ".png", ".gif"):
-                    with open(file_path, "rb") as image_file:
-                        image_data = image_file.read()
-                    context = client.image_to_text(image=image_data, model=MODELS["caption"])
-                else:
-                    if file_path.suffix == ".pdf":
-                        context = extract_text_from_pdf(str(file_path))
-                    elif file_path.suffix == ".docx":
-                        context = extract_text_from_docx(str(file_path))
-                    elif file_path.suffix == ".pptx":
-                        context = extract_text_from_pptx(str(file_path))
-                    elif file_path.suffix in (".xlsx", ".xls"):
-                        context = extract_text_from_excel(str(file_path))
-                    else:
-                        try:
-                            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-                                context = f.read()
-                        except Exception as e:
-                            logger.error(f"Error reading file: {e}")
-                            context = ""
             else:
-                logger.warning(f"File not found: {request.file_id}")
-        # Adapter le prompt au format attendu par le modèle T5
-        inputs = f"answer question: {request.question} context: {context[:1500]}"
-        response = client.text_generation(
-            prompt=inputs,
             model=MODELS["qa"],
-            max_new_tokens=250,
-            temperature=0.7
         )
-        return {"answer": response}
     except Exception as e:
         logger.error(f"QA error: {e}")
         raise HTTPException(500, f"Erreur de réponse: {str(e)}")
@@ -280,15 +249,11 @@ async def answer_question(request: QARequest):
 @app.get("/api/file/{file_id}")
 async def get_file(file_id: str):
     try:
-        file_path = find_file_by_id(file_id)
-        if not file_path:
-            raise HTTPException(404, "Fichier non trouvé")
         return FileResponse(file_path)
-    except HTTPException as e:
-        raise e
     except Exception as e:
         logger.error(f"File retrieval error: {e}")
-        raise HTTPException(500, f"Erreur lors de la récupération du fichier: {str(e)}")
 # Gestion des erreurs
 @app.exception_handler(HTTPException)

 MODELS = {
     "summary": "facebook/bart-large-cnn",
     "caption": "Salesforce/blip-image-captioning-large",
+    "qa": "deepseek-ai/DeepSeek-V2-Chat"
 }
 # Modèles Pydantic
         logger.error(f"Excel extraction error: {e}")
         raise HTTPException(400, "Erreur d'extraction Excel")
 async def process_uploaded_file(file: UploadFile) -> FileInfo:
     file_ext = Path(file.filename).suffix.lower()
     file_id = str(uuid.uuid4())
 @app.post("/api/summarize")
 async def summarize_document(request: SummaryRequest):
     try:
+        file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
         text = ""
         if file_path.suffix == ".pdf":
             text = extract_text_from_pdf(str(file_path))
         else:
+            with open(file_path, "r", encoding="utf-8") as f:
+                text = f.read()
+        prompt = f"""
+        Résumez ce document de manière concise en français.
+        Concentrez-vous sur les points principaux.
+        Le résumé doit faire environ {request.max_length} mots.
+        Document:
+        {text[:5000]}... [truncated]
+        """
         summary = client.summarization(
+            text=text,
             model=MODELS["summary"],
             parameters={"max_length": request.max_length}
         )
         return {"summary": summary}
     except Exception as e:
         logger.error(f"Summarization error: {e}")
         raise HTTPException(500, f"Erreur de résumé: {str(e)}")
 @app.post("/api/caption")
 async def caption_image(request: CaptionRequest):
     try:
+        file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
         with open(file_path, "rb") as image_file:
             image_data = image_file.read()
         )
         return {"caption": caption}
     except Exception as e:
         logger.error(f"Captioning error: {e}")
         raise HTTPException(500, f"Erreur de description: {str(e)}")
     try:
         context = ""
         if request.file_id:
+            file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
+            if file_path.suffix in (".jpg", ".jpeg", ".png"):
+                with open(file_path, "rb") as image_file:
+                    image_data = image_file.read()
+                context = client.image_to_text(image=image_data, model=MODELS["caption"])
             else:
+                if file_path.suffix == ".pdf":
+                    context = extract_text_from_pdf(str(file_path))
+                else:
+                    with open(file_path, "r", encoding="utf-8") as f:
+                        context = f.read()
+        prompt = f"""
+        Vous êtes un assistant IA qui répond à des questions en français.
+        Répondez de manière précise et concise.
+        Contexte: {context[:3000]}
+        Question: {request.question}
+        Réponse:
+        """
+        response = client.chat_completion(
             model=MODELS["qa"],
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=500
         )
+        return {"answer": response.choices[0].message.content}
     except Exception as e:
         logger.error(f"QA error: {e}")
         raise HTTPException(500, f"Erreur de réponse: {str(e)}")
 @app.get("/api/file/{file_id}")
 async def get_file(file_id: str):
     try:
+        file_path = next(f for f in UPLOAD_FOLDER.glob(f"{file_id}*"))
         return FileResponse(file_path)
     except Exception as e:
         logger.error(f"File retrieval error: {e}")
+        raise HTTPException(404, "Fichier non trouvé")
 # Gestion des erreurs
 @app.exception_handler(HTTPException)