Spaces:

Tech-di
/

WallTD-v.1

Sleeping

App Files Files Community

yousbek committed on Apr 5, 2025

Commit

210c948

verified ·

1 Parent(s): 977767b

Update main.py

Browse files

Files changed (1) hide show

main.py +74 -77

main.py CHANGED Viewed

@@ -12,7 +12,7 @@ from transformers import (
 )
 from utils import extract_text, save_file, verify_summary, ensure_complete_sentences
 from utils import convert_to_text, save_translated_file
-from langdetect import detect, DetectorFactory, LangDetectException
 from langcodes import Language
 import torch
 from huggingface_hub import InferenceClient
@@ -103,14 +103,15 @@ def split_text(text, max_tokens=900):
         chunks.append(summary_tokenizer.convert_tokens_to_string(current_chunk))
     return chunks
 # Document & Image Analysis (Summarization & Interpretation)
 @app.post("/docsum_imginter")
 async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
     file_path = UPLOAD_DIR / file.filename
     output_filename = f"summarized_{file.filename}"
     output_path = PROCESSED_DIR / output_filename
     with open(file_path, "wb") as f:
         shutil.copyfileobj(file.file, f)
@@ -122,21 +123,21 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
         if len(text.strip().split()) < 150:
             raise HTTPException(
-                    400,
-                    "WallD thinks the file is too small for summarization - minimum 150 words",
-                )
         text = text.encode("ascii", "ignore").decode("ascii")
         chunks = split_text(text)
         summaries = []
         prompt = (
-                "Generate a concise, factual summary covering ALL key sections of the text. "
-                "Include: main objectives, critical details, and outcomes if mentioned. "
-                "Never include: contact information, website links, or promotional content. "
-                "\n"
-                "Text to summarize:\n{chunk}"
-            )
         for chunk in chunks:
             word_count = len(chunk.split())
@@ -145,21 +146,21 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
                 continue
             max_length = min(
-                    max(int(word_count * 0.4), 150),
-                    512,
-                )
             summary_result = summarizer(
-                    prompt.format(chunk=chunk),
-                    max_length=max_length,
-                    min_length=max(150, int(max_length * 0.6)),
-                    do_sample=False,
-                    truncation=True,
-                    repetition_penalty=1.5,
-                    no_repeat_ngram_size=3,
-                    early_stopping=False,
-                    num_beams=4,
-                    length_penalty=1.0,
-                )
             if summary_result:
                 raw_summary = summary_result[0]["summary_text"]
@@ -170,46 +171,46 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
         if not summaries:
             raise HTTPException(
-                    500, "Summary verification failed - no valid content extracted"
-                )
         full_summary = "\n".join(filter(None, summaries))
         if len(summaries) > 1:
             full_summary = summarizer(
-                    f"Combine these partial summaries into one coherent paragraph:\n{full_summary}",
-                    max_length=512,
-                )[0]["summary_text"]
         if not full_summary.strip():
             sentences = [s.strip() for s in text.split(".") if s.strip()]
             full_summary = (
-                    ". ".join(sentences[:3]) + "." if sentences else text[:500]
-                )
         save_file(full_summary, file_path, file_type, output_path)
         return FileResponse(output_path, filename=output_filename)
     elif task.lower() == "interpret":
-        try:
             with Image.open(file_path) as image:
                 if image.mode != "RGB":
                     image = image.convert("RGB")
                 inputs = processor(images=image, return_tensors="pt")
                 if inputs is None or "pixel_values" not in inputs:
                     raise ValueError("Image processing failed: No valid inputs generated.")
                 outputs = interpretation_model.generate(**inputs, repetition_penalty=1.2)
                 if outputs is None:
                     raise ValueError("Model generation failed: No outputs produced.")
                 caption = processor.decode(outputs[0], skip_special_tokens=True)
                 return {"caption": caption if caption else "No caption generated"}
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
         finally:
             if file_path.exists():
                 file_path.unlink()
@@ -221,7 +222,6 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
     try:
         file_type = file.filename.split(".")[-1].lower()
         file_path = UPLOAD_DIR / file.filename
         with open(file_path, "wb") as f:
             shutil.copyfileobj(file.file, f)
@@ -244,7 +244,6 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
                 detail="The File doesn't contain any text.",
             )
         result = question_answering(question=question, context=text)
         return {"answer": result["answer"]}
@@ -252,11 +251,10 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
         raise HTTPException(
             status_code=500, detail=f"Error processing request: {str(e)}"
         )
     finally:
         if file_path.exists():
             file_path.unlink()
 # Data Visualization Code Generation
@@ -318,10 +316,10 @@ async def visualization(file: UploadFile = File(...), request: str = Form(...)):
         exec(executable_code, exec_globals)
         plt.savefig(plot_path, bbox_inches="tight")
         plt.close()
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error executing code: {str(e)}")
     finally:
         if file_path.exists():
             file_path.unlink()
@@ -335,43 +333,38 @@ async def visualization(file: UploadFile = File(...), request: str = Form(...)):
 # Text Translation
-LANGUAGE_CODE_MAPPING = {
-    'french': 'fr','fr': 'fr',
-    'english': 'en','en': 'en',
-    'spanish': 'es','es': 'es',
-    'german': 'de','de': 'de',
-    'italian': 'it', 'it': 'it',
-    'portuguese': 'pt','pt': 'pt',
-    'dutch': 'nl','nl': 'nl',
-    'russian': 'ru','ru': 'ru',
-    'chinese': 'zh','zh': 'zh',
-    'japanese': 'ja','ja': 'ja',
-    'arabic': 'ar','ar': 'ar',
-}
 async def translate_large_text(text, target_lang="fr"):
-    # Détection automatique de la langue source
-    try:
-        src_lang = detect(text)
-        # Si la détection échoue ou retourne un code incorrect, on utilise l'anglais par défaut
-        if not src_lang or len(src_lang) != 2:
-            src_lang = "en"
-    except:
-        src_lang = "en"
-    # Normalisation du code de langue cible
-    target_lang = target_lang.lower().strip()
-    target_lang = LANGUAGE_CODE_MAPPING.get(target_lang, target_lang[:2])
     chunks = split_tran_text_trans(text)
     translated_chunks = []
-    try:
-        translation_tokenizer.src_lang = src_lang
-    except:
-        # Si la langue source n'est pas supportée, on utilise l'anglais par défaut
-        translation_tokenizer.src_lang = "en"
     for chunk in chunks:
         try:
@@ -385,14 +378,15 @@ async def translate_large_text(text, target_lang="fr"):
             translated_chunks.append(translated)
         except Exception as e:
             print(f"Error translating chunk: {str(e)}")
-            translated_chunks.append(chunk)  # keep the original in case of error
     return "\n\n".join(translated_chunks)
 @app.post("/translate")
 async def translate_document(
         file: UploadFile = File(...),
-        target_language: str = "fr",  # peut être "french", "français", "fr", etc.
 ):
     try:
         text = await convert_to_text(file)
@@ -414,3 +408,6 @@ async def translate_document(
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 )
 from utils import extract_text, save_file, verify_summary, ensure_complete_sentences
 from utils import convert_to_text, save_translated_file
+from langdetect import detect, DetectorFactory
 from langcodes import Language
 import torch
 from huggingface_hub import InferenceClient
         chunks.append(summary_tokenizer.convert_tokens_to_string(current_chunk))
     return chunks
 # Document & Image Analysis (Summarization & Interpretation)
 @app.post("/docsum_imginter")
 async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
     file_path = UPLOAD_DIR / file.filename
     output_filename = f"summarized_{file.filename}"
     output_path = PROCESSED_DIR / output_filename
     with open(file_path, "wb") as f:
         shutil.copyfileobj(file.file, f)
         if len(text.strip().split()) < 150:
             raise HTTPException(
+                400,
+                "WallD thinks the file is too small for summarization - minimum 150 words",
+            )
         text = text.encode("ascii", "ignore").decode("ascii")
         chunks = split_text(text)
         summaries = []
         prompt = (
+            "Generate a concise, factual summary covering ALL key sections of the text. "
+            "Include: main objectives, critical details, and outcomes if mentioned. "
+            "Never include: contact information, website links, or promotional content. "
+            "\n"
+            "Text to summarize:\n{chunk}"
+        )
         for chunk in chunks:
             word_count = len(chunk.split())
                 continue
             max_length = min(
+                max(int(word_count * 0.4), 150),
+                512,
+            )
             summary_result = summarizer(
+                prompt.format(chunk=chunk),
+                max_length=max_length,
+                min_length=max(150, int(max_length * 0.6)),
+                do_sample=False,
+                truncation=True,
+                repetition_penalty=1.5,
+                no_repeat_ngram_size=3,
+                early_stopping=False,
+                num_beams=4,
+                length_penalty=1.0,
+            )
             if summary_result:
                 raw_summary = summary_result[0]["summary_text"]
         if not summaries:
             raise HTTPException(
+                500, "Summary verification failed - no valid content extracted"
+            )
         full_summary = "\n".join(filter(None, summaries))
         if len(summaries) > 1:
             full_summary = summarizer(
+                f"Combine these partial summaries into one coherent paragraph:\n{full_summary}",
+                max_length=512,
+            )[0]["summary_text"]
         if not full_summary.strip():
             sentences = [s.strip() for s in text.split(".") if s.strip()]
             full_summary = (
+                ". ".join(sentences[:3]) + "." if sentences else text[:500]
+            )
         save_file(full_summary, file_path, file_type, output_path)
         return FileResponse(output_path, filename=output_filename)
     elif task.lower() == "interpret":
+        try:
             with Image.open(file_path) as image:
                 if image.mode != "RGB":
                     image = image.convert("RGB")
                 inputs = processor(images=image, return_tensors="pt")
                 if inputs is None or "pixel_values" not in inputs:
                     raise ValueError("Image processing failed: No valid inputs generated.")
                 outputs = interpretation_model.generate(**inputs, repetition_penalty=1.2)
                 if outputs is None:
                     raise ValueError("Model generation failed: No outputs produced.")
                 caption = processor.decode(outputs[0], skip_special_tokens=True)
                 return {"caption": caption if caption else "No caption generated"}
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
         finally:
             if file_path.exists():
                 file_path.unlink()
     try:
         file_type = file.filename.split(".")[-1].lower()
         file_path = UPLOAD_DIR / file.filename
         with open(file_path, "wb") as f:
             shutil.copyfileobj(file.file, f)
                 detail="The File doesn't contain any text.",
             )
         result = question_answering(question=question, context=text)
         return {"answer": result["answer"]}
         raise HTTPException(
             status_code=500, detail=f"Error processing request: {str(e)}"
         )
     finally:
         if file_path.exists():
             file_path.unlink()
 # Data Visualization Code Generation
         exec(executable_code, exec_globals)
         plt.savefig(plot_path, bbox_inches="tight")
         plt.close()
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error executing code: {str(e)}")
     finally:
         if file_path.exists():
             file_path.unlink()
 # Text Translation
+def split_tran_text_trans(text, max_chunk_size=400):  # pack blank-line-separated paragraphs of `text` into chunks of at most `max_chunk_size` whitespace-delimited words; returns a list of strings
+    chunks = []  # finished chunks, each a run of paragraphs joined by a blank line
+    current_chunk = []  # paragraphs collected for the chunk currently being built
+    current_length = 0  # running word count of current_chunk
+    paragraphs = re.split(r'\n\n+', text)  # a paragraph boundary is two or more consecutive newlines; NOTE(review): relies on `re` being imported elsewhere in main.py, which this diff does not show - confirm
+    for para in paragraphs:
+        para = para.strip()
+        if not para:  # ignore paragraphs that are empty after stripping
+            continue
+        if current_length + len(para.split()) <= max_chunk_size:  # paragraph still fits in the current chunk
+            current_chunk.append(para)
+            current_length += len(para.split())
+        else:  # paragraph would overflow: flush what we have and start a new chunk with it
+            if current_chunk:
+                chunks.append("\n\n".join(current_chunk))
+            current_chunk = [para]  # NOTE: a single paragraph longer than max_chunk_size words is kept whole, so a chunk can exceed the limit
+            current_length = len(para.split())
+    if current_chunk:  # flush the trailing partial chunk
+        chunks.append("\n\n".join(current_chunk))
+    return chunks
 async def translate_large_text(text, target_lang="fr"):
     chunks = split_tran_text_trans(text)
     translated_chunks = []
+    translation_tokenizer.src_lang = "en"
     for chunk in chunks:
         try:
             translated_chunks.append(translated)
         except Exception as e:
             print(f"Error translating chunk: {str(e)}")
+            translated_chunks.append(chunk)  # keep the original chunk if translation fails
     return "\n\n".join(translated_chunks)
 @app.post("/translate")
 async def translate_document(
         file: UploadFile = File(...),
+        target_language: str = "fr",  # default target language
 ):
     try:
         text = await convert_to_text(file)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))