ghizlaneimane committed on
Commit
ed7302a
·
verified ·
1 Parent(s): 368b7ae

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +193 -0
app.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, Form
2
+ from fastapi.responses import JSONResponse
3
+ from transformers import pipeline
4
+ from typing import Optional
5
+ import io
6
+ from PIL import Image
7
+ import tempfile
8
+ import os
9
+ import fitz # PyMuPDF
10
+ import docx
11
+ import pandas as pd
12
+ import pptx
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from langdetect import detect
15
+ from fastapi.staticfiles import StaticFiles
16
+ from fastapi.responses import HTMLResponse
17
+ from fastapi import Request
18
+ from fastapi.templating import Jinja2Templates
19
# ASGI application object; routes and middleware below are registered on it.
app = FastAPI()

# Allow cross-origin requests from any origin, with any HTTP method and any
# request header.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
28
+
29
# List of supported languages (two-letter codes). Both the detected source
# language and the requested target language are checked against this list.
SUPPORTED_LANGUAGES = ["fr", "en", "de", "es", "it", "zh", "ar"]
31
+
32
# Translation models, keyed by "<source>-<target>" language pair. Every value
# follows the "Helsinki-NLP/opus-mt-<source>-<target>" naming scheme.
# Pairs missing from this table are translated through English as a pivot
# (see translate_text).
# NOTE(review): not every pair listed here has been verified to exist on the
# Hugging Face Hub — confirm before relying on a given direct pair.
translation_models = {
    "fr-en": "Helsinki-NLP/opus-mt-fr-en",
    "en-fr": "Helsinki-NLP/opus-mt-en-fr",
    "fr-de": "Helsinki-NLP/opus-mt-fr-de",
    "de-fr": "Helsinki-NLP/opus-mt-de-fr",
    "fr-es": "Helsinki-NLP/opus-mt-fr-es",
    "es-fr": "Helsinki-NLP/opus-mt-es-fr",
    "en-zh": "Helsinki-NLP/opus-mt-en-zh",
    "zh-en": "Helsinki-NLP/opus-mt-zh-en",
    "en-it": "Helsinki-NLP/opus-mt-en-it",
    "it-en": "Helsinki-NLP/opus-mt-it-en",
    "en-ar": "Helsinki-NLP/opus-mt-en-ar",
    "ar-en": "Helsinki-NLP/opus-mt-ar-en",
    "en-es": "Helsinki-NLP/opus-mt-en-es",
    "en-de": "Helsinki-NLP/opus-mt-en-de",
    "es-ar": "Helsinki-NLP/opus-mt-es-ar",
    "es-en": "Helsinki-NLP/opus-mt-es-en",
    "es-de": "Helsinki-NLP/opus-mt-es-de",
    "es-it": "Helsinki-NLP/opus-mt-es-it",
    "es-zh": "Helsinki-NLP/opus-mt-es-zh",
    "ar-fr": "Helsinki-NLP/opus-mt-ar-fr",
    "ar-de": "Helsinki-NLP/opus-mt-ar-de",
    "ar-es": "Helsinki-NLP/opus-mt-ar-es",
    "ar-it": "Helsinki-NLP/opus-mt-ar-it",
    "ar-zh": "Helsinki-NLP/opus-mt-ar-zh",
    "de-en": "Helsinki-NLP/opus-mt-de-en",
    "de-de": "Helsinki-NLP/opus-mt-de-de",
    "de-es": "Helsinki-NLP/opus-mt-de-es",
    "de-it": "Helsinki-NLP/opus-mt-de-it",
    "de-zh": "Helsinki-NLP/opus-mt-de-zh",
    "de-ar": "Helsinki-NLP/opus-mt-de-ar",
    "it-fr": "Helsinki-NLP/opus-mt-it-fr",
    "it-de": "Helsinki-NLP/opus-mt-it-de",
    "it-es": "Helsinki-NLP/opus-mt-it-es",
    "it-zh": "Helsinki-NLP/opus-mt-it-zh",
    "it-ar": "Helsinki-NLP/opus-mt-it-ar",
    "zh-fr": "Helsinki-NLP/opus-mt-zh-fr",
    # Previously pointed at the zh-en model, which returned English text for
    # Chinese -> German requests.
    "zh-de": "Helsinki-NLP/opus-mt-zh-de",
    "zh-it": "Helsinki-NLP/opus-mt-zh-it",
    "zh-es": "Helsinki-NLP/opus-mt-zh-es",
    "zh-ar": "Helsinki-NLP/opus-mt-zh-ar",
}
77
+
78
def extract_text_from_pdf(file_path):
    """Return the plain text of the PDF at ``file_path``.

    The text of each page is extracted with PyMuPDF and followed by a
    newline; pages are concatenated in document order.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text("text") + "\n" for page in pdf]
    return "".join(page_texts)
84
+
85
def extract_text_from_docx(file_path):
    """Return the text of the Word document at ``file_path``.

    Only the document's paragraphs are read; they are joined with newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
88
+
89
def extract_text_from_pptx(file_path):
    """Return the text of the PowerPoint presentation at ``file_path``.

    Every shape exposing a ``text`` attribute contributes one entry, in
    slide order then shape order; entries are joined with newlines.
    """
    deck = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
97
+
98
def extract_text_from_excel(file_path):
    """Return the first worksheet of an Excel file rendered as plain text.

    Args:
        file_path: Path to an ``.xlsx`` or legacy ``.xls`` workbook.

    Returns:
        The sheet formatted with ``DataFrame.to_string`` (row index omitted).

    ``openpyxl`` only reads the newer XML-based formats, so it is forced only
    for ``.xlsx`` files. For anything else (notably ``.xls``, which the
    ``/translate`` endpoint also routes here) pandas picks the engine itself.
    """
    is_xlsx = str(file_path).lower().endswith(".xlsx")
    engine = "openpyxl" if is_xlsx else None  # None lets pandas auto-detect
    df = pd.read_excel(file_path, engine=engine)
    return df.to_string(index=False)
101
+
102
def chunk_text(text, max_length=512):
    """Split ``text`` into chunks of whole words, each at most ``max_length`` characters.

    Words are whitespace-delimited and re-joined with single spaces. A single
    word longer than ``max_length`` is kept intact as its own (oversized)
    chunk rather than being cut.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters (not model tokens).

    Returns:
        A list of non-empty chunk strings; empty if ``text`` has no words.
    """
    chunks = []
    current_words = []
    current_length = 0  # always equals len(" ".join(current_words))

    for word in text.split():
        # A separating space is only needed once the chunk already has a word;
        # counting it for an empty chunk used to produce an empty first chunk.
        separator = 1 if current_words else 0
        candidate_length = current_length + separator + len(word)

        if current_words and candidate_length > max_length:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = candidate_length

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
117
+
118
# Loaded translation pipelines keyed by model name. Building a pipeline loads
# the whole model, so instances are reused across requests. The cache is
# bounded to limit memory use; the least recently used entry is evicted first.
_TRANSLATOR_CACHE = {}
_MAX_CACHED_TRANSLATORS = 4


def _get_translator(model_name):
    """Return a cached translation pipeline for ``model_name``, loading it if needed."""
    translator = _TRANSLATOR_CACHE.pop(model_name, None)
    if translator is None:
        translator = pipeline("translation", model=model_name)
        while len(_TRANSLATOR_CACHE) >= _MAX_CACHED_TRANSLATORS:
            # Dicts preserve insertion order, so the first key is the oldest.
            _TRANSLATOR_CACHE.pop(next(iter(_TRANSLATOR_CACHE)))
    # (Re)insert last so this model becomes the most recently used entry.
    _TRANSLATOR_CACHE[model_name] = translator
    return translator


def _translate_chunks(translator, text):
    """Translate ``text`` chunk by chunk and join the results with spaces."""
    return " ".join(
        translator(chunk)[0]["translation_text"] for chunk in chunk_text(text)
    )


def translate_text(text, source_lang, target_lang):
    """Translate ``text`` from ``source_lang`` to ``target_lang``.

    A direct model from ``translation_models`` is used when one exists for
    the language pair; otherwise the text is translated to English and then
    from English to the target language.

    Args:
        text: The text to translate.
        source_lang: Language code of ``text``.
        target_lang: Language code to translate into.

    Returns:
        The translated text, or ``None`` when either language is not in
        ``SUPPORTED_LANGUAGES`` or no direct or pivot model is available.
    """
    if source_lang not in SUPPORTED_LANGUAGES or target_lang not in SUPPORTED_LANGUAGES:
        return None  # unsupported language

    direct_model = translation_models.get(f"{source_lang}-{target_lang}")
    if direct_model is not None:
        return _translate_chunks(_get_translator(direct_model), text)

    # No direct model: use English as a pivot language.
    model_to_en = translation_models.get(f"{source_lang}-en")
    model_from_en = translation_models.get(f"en-{target_lang}")
    if model_to_en is not None and model_from_en is not None:
        translator_to_en = _get_translator(model_to_en)
        translator_from_en = _get_translator(model_from_en)
        intermediate_text = _translate_chunks(translator_to_en, text)
        # The English text is re-chunked because its length differs from the source.
        return _translate_chunks(translator_from_en, intermediate_text)

    return None  # no model available
144
+
145
# Jinja2 templates are loaded from the local "templates" directory.
templates = Jinja2Templates(directory="templates")
# Mount static files (CSS, JS, images) from the local "static" directory
# under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")
148
+
149
@app.get("/")
async def read_root(request: Request):
    """Serve the landing page by rendering the ``prj.html`` template."""
    context = {"request": request}
    return templates.TemplateResponse("prj.html", context)
152
+
153
@app.post("/translate")
async def translate_document(file: UploadFile = File(...), language: str = Form(...)):
    """Extract the text of an uploaded document and translate it.

    Args:
        file: Uploaded PDF, DOCX, PPTX, XLS or XLSX document; the format is
            chosen from the filename extension.
        language: Target language code (e.g. ``"fr"``).

    Returns:
        A JSON response with a ``translation`` key on success, or an
        ``error`` key with status 400 (unsupported format, no text,
        unsupported language, no model) or 500 (unexpected failure).
    """
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xls": extract_text_from_excel,
        "xlsx": extract_text_from_excel,
    }

    try:
        # The filename can be missing on some uploads; treat that as unsupported.
        suffix = (file.filename or "").split(".")[-1].lower()

        # Reject unknown formats before anything is written to disk.
        if suffix not in extractors:
            return JSONResponse({"error": "Format non supporté"}, status_code=400)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}")
        try:
            # Close the handle before the extractor reopens the file by path.
            with temp_file:
                temp_file.write(await file.read())
            text = extractors[suffix](temp_file.name)
        finally:
            # Always remove the temp file, even when extraction fails.
            os.remove(temp_file.name)

        if not text.strip():
            return JSONResponse({"error": "Aucun texte détecté"}, status_code=400)

        # langdetect reports regional variants such as "zh-cn" / "zh-tw";
        # keep only the base language so they match SUPPORTED_LANGUAGES.
        detected_lang = detect(text).split("-")[0]
        if detected_lang not in SUPPORTED_LANGUAGES:
            return JSONResponse({"error": f"Langue non supportée : {detected_lang}"}, status_code=400)

        target_lang = language.strip().lower()
        if detected_lang == target_lang:
            return JSONResponse({"translation": text, "note": "Déjà dans la langue choisie."})

        translated_text = translate_text(text, detected_lang, target_lang)
        if translated_text:
            return JSONResponse({"translation": translated_text})
        return JSONResponse({"error": "Aucun modèle de traduction trouvé."}, status_code=400)

    except Exception as e:
        # Top-level boundary: report any unexpected failure as a JSON 500.
        return JSONResponse({"error": str(e)}, status_code=500)