SamiKLN committed on
Commit
dfd1eb1
·
verified ·
1 Parent(s): 663edf2

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +272 -272
main.py CHANGED
@@ -1,273 +1,273 @@
1
- import os
2
- import uuid
3
- import logging
4
- from pathlib import Path
5
- from typing import List, Optional
6
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
7
- from fastapi.staticfiles import StaticFiles
8
- from fastapi.templating import Jinja2Templates
9
- from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
10
- from fastapi.middleware.cors import CORSMiddleware
11
- from pydantic import BaseModel
12
- from huggingface_hub import InferenceClient
13
- import fitz # PyMuPDF
14
- from PIL import Image
15
- import io
16
- import pandas as pd
17
- from docx import Document
18
- from pptx import Presentation
19
-
20
- # Configuration du logging
21
- logging.basicConfig(level=logging.INFO)
22
- logger = logging.getLogger(__name__)
23
-
24
- # Initialisation de l'application FastAPI
25
- app = FastAPI()
26
-
27
- # Configuration CORS
28
- app.add_middleware(
29
- CORSMiddleware,
30
- allow_origins=["*"],
31
- allow_methods=["*"],
32
- allow_headers=["*"],
33
- )
34
-
35
- # Chemins des fichiers
36
- BASE_DIR = Path(__file__).parent.parent
37
- UPLOAD_FOLDER = BASE_DIR / "uploads"
38
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
39
-
40
- # Modèles Hugging Face
41
- HF_TOKEN = os.getenv("HF_TOKEN")
42
- client = InferenceClient(token=HF_TOKEN)
43
- MODELS = {
44
- "summary": "facebook/bart-large-cnn",
45
- "caption": "Salesforce/blip-image-captioning-large",
46
- "qa": "deepseek-ai/DeepSeek-V2-Chat"
47
- }
48
-
49
- # Modèles Pydantic
50
- class FileInfo(BaseModel):
51
- file_id: str
52
- file_name: str
53
- file_type: str
54
- file_path: str
55
- extracted_text: Optional[str] = None
56
-
57
- class SummaryRequest(BaseModel):
58
- file_id: str
59
- max_length: int = 150
60
-
61
- class CaptionRequest(BaseModel):
62
- file_id: str
63
-
64
- class QARequest(BaseModel):
65
- file_id: Optional[str] = None
66
- question: str
67
-
68
- # Initialisation des templates
69
- templates = Jinja2Templates(directory=str(BASE_DIR / "frontend/templates"))
70
- app.mount("/static", StaticFiles(directory=str(BASE_DIR / "frontend/static")), name="static")
71
-
72
- # Fonctions utilitaires
73
- def extract_text_from_pdf(file_path: str) -> str:
74
- try:
75
- doc = fitz.open(file_path)
76
- return "\n".join([page.get_text() for page in doc])
77
- except Exception as e:
78
- logger.error(f"PDF extraction error: {e}")
79
- raise HTTPException(400, "Erreur d'extraction PDF")
80
-
81
- def extract_text_from_docx(file_path: str) -> str:
82
- try:
83
- doc = Document(file_path)
84
- return "\n".join([para.text for para in doc.paragraphs])
85
- except Exception as e:
86
- logger.error(f"DOCX extraction error: {e}")
87
- raise HTTPException(400, "Erreur d'extraction DOCX")
88
-
89
- def extract_text_from_pptx(file_path: str) -> str:
90
- try:
91
- prs = Presentation(file_path)
92
- text = []
93
- for slide in prs.slides:
94
- for shape in slide.shapes:
95
- if hasattr(shape, "text"):
96
- text.append(shape.text)
97
- return "\n".join(text)
98
- except Exception as e:
99
- logger.error(f"PPTX extraction error: {e}")
100
- raise HTTPException(400, "Erreur d'extraction PPTX")
101
-
102
- def extract_text_from_excel(file_path: str) -> str:
103
- try:
104
- xls = pd.ExcelFile(file_path)
105
- text = []
106
- for sheet_name in xls.sheet_names:
107
- df = pd.read_excel(file_path, sheet_name=sheet_name)
108
- text.append(f"Feuille: {sheet_name}\n{df.to_string()}")
109
- return "\n\n".join(text)
110
- except Exception as e:
111
- logger.error(f"Excel extraction error: {e}")
112
- raise HTTPException(400, "Erreur d'extraction Excel")
113
-
114
- async def process_uploaded_file(file: UploadFile) -> FileInfo:
115
- file_ext = Path(file.filename).suffix.lower()
116
- file_id = str(uuid.uuid4())
117
- file_path = str(UPLOAD_FOLDER / f"{file_id}{file_ext}")
118
-
119
- # Sauvegarde du fichier
120
- with open(file_path, "wb") as buffer:
121
- buffer.write(await file.read())
122
-
123
- # Extraction du texte selon le type de fichier
124
- text = ""
125
- if file_ext == ".pdf":
126
- text = extract_text_from_pdf(file_path)
127
- elif file_ext == ".docx":
128
- text = extract_text_from_docx(file_path)
129
- elif file_ext == ".pptx":
130
- text = extract_text_from_pptx(file_path)
131
- elif file_ext in (".xlsx", ".xls"):
132
- text = extract_text_from_excel(file_path)
133
-
134
- return FileInfo(
135
- file_id=file_id,
136
- file_name=file.filename,
137
- file_type=file_ext[1:],
138
- file_path=file_path,
139
- extracted_text=text if text else None
140
- )
141
-
142
- # Endpoints
143
- @app.get("/", response_class=HTMLResponse)
144
- async def home(request: Request):
145
- return templates.TemplateResponse("index.html", {"request": request})
146
-
147
- @app.post("/api/upload", response_model=List[FileInfo])
148
- async def upload_files(files: List[UploadFile] = File(...)):
149
- try:
150
- processed_files = []
151
- for file in files:
152
- processed_file = await process_uploaded_file(file)
153
- processed_files.append(processed_file)
154
- return processed_files
155
- except Exception as e:
156
- logger.error(f"Upload error: {e}")
157
- raise HTTPException(500, f"Erreur lors de l'upload: {str(e)}")
158
-
159
- @app.post("/api/summarize")
160
- async def summarize_document(request: SummaryRequest):
161
- try:
162
- file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
163
- text = ""
164
-
165
- if file_path.suffix == ".pdf":
166
- text = extract_text_from_pdf(str(file_path))
167
- else:
168
- with open(file_path, "r", encoding="utf-8") as f:
169
- text = f.read()
170
-
171
- prompt = f"""
172
- Résumez ce document de manière concise en français.
173
- Concentrez-vous sur les points principaux.
174
- Le résumé doit faire environ {request.max_length} mots.
175
-
176
- Document:
177
- {text[:5000]}... [truncated]
178
- """
179
-
180
- summary = client.summarization(
181
- text=text,
182
- model=MODELS["summary"],
183
- parameters={"max_length": request.max_length}
184
- )
185
-
186
- return {"summary": summary}
187
- except Exception as e:
188
- logger.error(f"Summarization error: {e}")
189
- raise HTTPException(500, f"Erreur de résumé: {str(e)}")
190
-
191
- @app.post("/api/caption")
192
- async def caption_image(request: CaptionRequest):
193
- try:
194
- file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
195
-
196
- with open(file_path, "rb") as image_file:
197
- image_data = image_file.read()
198
-
199
- caption = client.image_to_text(
200
- image=image_data,
201
- model=MODELS["caption"]
202
- )
203
-
204
- return {"caption": caption}
205
- except Exception as e:
206
- logger.error(f"Captioning error: {e}")
207
- raise HTTPException(500, f"Erreur de description: {str(e)}")
208
-
209
- @app.post("/api/answer")
210
- async def answer_question(request: QARequest):
211
- try:
212
- context = ""
213
- if request.file_id:
214
- file_path = next(f for f in UPLOAD_FOLDER.glob(f"{request.file_id}*"))
215
-
216
- if file_path.suffix in (".jpg", ".jpeg", ".png"):
217
- with open(file_path, "rb") as image_file:
218
- image_data = image_file.read()
219
- context = client.image_to_text(image=image_data, model=MODELS["caption"])
220
- else:
221
- if file_path.suffix == ".pdf":
222
- context = extract_text_from_pdf(str(file_path))
223
- else:
224
- with open(file_path, "r", encoding="utf-8") as f:
225
- context = f.read()
226
-
227
- prompt = f"""
228
- Vous êtes un assistant IA qui répond à des questions en français.
229
- Répondez de manière précise et concise.
230
- Contexte: {context[:3000]}
231
- Question: {request.question}
232
- Réponse:
233
- """
234
-
235
- response = client.chat_completion(
236
- model=MODELS["qa"],
237
- messages=[{"role": "user", "content": prompt}],
238
- max_tokens=500
239
- )
240
-
241
- return {"answer": response.choices[0].message.content}
242
- except Exception as e:
243
- logger.error(f"QA error: {e}")
244
- raise HTTPException(500, f"Erreur de réponse: {str(e)}")
245
-
246
- @app.get("/api/file/{file_id}")
247
- async def get_file(file_id: str):
248
- try:
249
- file_path = next(f for f in UPLOAD_FOLDER.glob(f"{file_id}*"))
250
- return FileResponse(file_path)
251
- except Exception as e:
252
- logger.error(f"File retrieval error: {e}")
253
- raise HTTPException(404, "Fichier non trouvé")
254
-
255
- @app.get("/video-background")
256
- async def get_video():
257
- return FileResponse(BASE_DIR / "frontend/static/videos/background.mp4")
258
-
259
- # Gestion des erreurs
260
- @app.exception_handler(HTTPException)
261
- async def http_exception_handler(request, exc):
262
- return JSONResponse(
263
- status_code=exc.status_code,
264
- content={"detail": exc.detail},
265
- )
266
-
267
- @app.exception_handler(Exception)
268
- async def generic_exception_handler(request, exc):
269
- logger.error(f"Unhandled exception: {exc}")
270
- return JSONResponse(
271
- status_code=500,
272
- content={"detail": "Une erreur interne est survenue"},
273
  )
 
1
+ import os
2
+ import uuid
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
7
+ from fastapi.staticfiles import StaticFiles
8
+ from fastapi.templating import Jinja2Templates
9
+ from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel
12
+ from huggingface_hub import InferenceClient
13
+ import fitz # PyMuPDF
14
+ from PIL import Image
15
+ import io
16
+ import pandas as pd
17
+ from docx import Document
18
+ from pptx import Presentation
19
+
20
# Logging: one module-level logger, INFO and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance.
app = FastAPI()

# CORS: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Filesystem layout: frontend assets are resolved from the directory above
# this file's directory, while uploads live next to this file.
BASE_DIR = Path(__file__).parent.parent
UPLOAD_FOLDER = Path(__file__).parent / "uploads"
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

# Hugging Face inference client and the model used for each task.
HF_TOKEN = os.environ.get("HF_TOKEN")
client = InferenceClient(token=HF_TOKEN)
MODELS = {
    "summary": "facebook/bart-large-cnn",
    "caption": "Salesforce/blip-image-captioning-large",
    "qa": "deepseek-ai/DeepSeek-V2-Chat",
}
48
+
49
+ # Modèles Pydantic
50
# Description of one stored upload, as returned by the upload endpoint.
# (Documented with comments rather than a docstring so the generated
# OpenAPI schema stays exactly as it was.)
class FileInfo(BaseModel):
    # Server-generated identifier (a UUID4 string) used by the other endpoints.
    file_id: str
    # Name of the file as sent by the client.
    file_name: str
    # Lower-cased extension without the leading dot.
    file_type: str
    # Location of the stored copy on the server.
    file_path: str
    # Text extracted at upload time; None when nothing was extracted.
    extracted_text: Optional[str] = None
56
+
57
# Request body of the summarization endpoint.
class SummaryRequest(BaseModel):
    # Identifier of a previously uploaded file.
    file_id: str
    # Forwarded to the summarization model as its "max_length" parameter.
    max_length: int = 150
60
+
61
# Request body of the image captioning endpoint.
class CaptionRequest(BaseModel):
    # Identifier of a previously uploaded image.
    file_id: str
63
+
64
# Request body of the question-answering endpoint.
class QARequest(BaseModel):
    # Optional identifier of an uploaded file used as context for the answer.
    file_id: Optional[str] = None
    # The question to answer.
    question: str
67
+
68
# Template engine and static assets, both located under BASE_DIR/frontend.
templates = Jinja2Templates(directory=str(BASE_DIR / "frontend" / "templates"))
app.mount(
    "/static",
    StaticFiles(directory=str(BASE_DIR / "frontend" / "static")),
    name="static",
)
71
+
72
+ # Fonctions utilitaires
73
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of each page, in page order, separated by a newline.

    Raises:
        HTTPException: 400 when the document cannot be opened or read.
    """
    try:
        # Open the document as a context manager so it is closed (and its
        # file handle released) even when a page fails to extract.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"PDF extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction PDF") from e
80
+
81
def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraph texts of a .docx file, one per line.

    Any failure while opening or reading the document is logged and
    reported to the caller as an HTTP 400 error.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        return "\n".join(lines)
    except Exception as err:
        logger.error(f"DOCX extraction error: {err}")
        raise HTTPException(400, "Erreur d'extraction DOCX")
88
+
89
def extract_text_from_pptx(file_path: str) -> str:
    """Return the text of every text-bearing shape of a .pptx file.

    Slides and shapes are visited in order; shapes without a ``text``
    attribute are skipped. Any failure is logged and reported to the
    caller as an HTTP 400 error.
    """
    try:
        deck = Presentation(file_path)
        chunks = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(chunks)
    except Exception as err:
        logger.error(f"PPTX extraction error: {err}")
        raise HTTPException(400, "Erreur d'extraction PPTX")
101
+
102
def extract_text_from_excel(file_path: str) -> str:
    """Render every sheet of an Excel workbook as plain text.

    Args:
        file_path: Path of the workbook on disk.

    Returns:
        One section per sheet, in workbook order, separated by a blank
        line. Each section is a "Feuille: <name>" header followed by the
        sheet rendered with ``DataFrame.to_string()``.

    Raises:
        HTTPException: 400 when the workbook cannot be opened or parsed.
    """
    try:
        sections = []
        # Open the workbook once and close it when done; every sheet is
        # parsed from the same handle instead of reopening the file.
        with pd.ExcelFile(file_path) as workbook:
            for sheet_name in workbook.sheet_names:
                df = pd.read_excel(workbook, sheet_name=sheet_name)
                sections.append(f"Feuille: {sheet_name}\n{df.to_string()}")
        return "\n\n".join(sections)
    except Exception as e:
        logger.error(f"Excel extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction Excel") from e
113
+
114
async def process_uploaded_file(file: UploadFile) -> FileInfo:
    """Store an uploaded file and extract its text when the type is supported.

    The file is saved in ``UPLOAD_FOLDER`` under a fresh UUID4 name that keeps
    the original (lower-cased) extension. Text is extracted for .pdf, .docx,
    .pptx, .xlsx and .xls files; other types are stored without extraction.

    Args:
        file: The multipart upload to store.

    Returns:
        A ``FileInfo`` describing the stored file; ``extracted_text`` is
        ``None`` when no text was extracted.

    Raises:
        HTTPException: 400 when text extraction fails (raised by the
            extractor for the file type).
    """
    # The client may omit the filename; fall back to an empty name instead
    # of crashing on Path(None) and on FileInfo validation.
    original_name = file.filename or ""
    file_ext = Path(original_name).suffix.lower()
    file_id = str(uuid.uuid4())
    file_path = str(UPLOAD_FOLDER / f"{file_id}{file_ext}")

    # Persist the upload.
    with open(file_path, "wb") as buffer:
        buffer.write(await file.read())

    # Pick the extractor from the extension; unsupported types get no text.
    extractors = {
        ".pdf": extract_text_from_pdf,
        ".docx": extract_text_from_docx,
        ".pptx": extract_text_from_pptx,
        ".xlsx": extract_text_from_excel,
        ".xls": extract_text_from_excel,
    }
    extractor = extractors.get(file_ext)
    text = extractor(file_path) if extractor is not None else ""

    return FileInfo(
        file_id=file_id,
        file_name=original_name,
        file_type=file_ext[1:],
        file_path=file_path,
        extracted_text=text or None,
    )
141
+
142
+ # Endpoints
143
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the ``index.html`` template for the landing page."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
146
+
147
@app.post("/api/upload", response_model=List[FileInfo])
async def upload_files(files: List[UploadFile] = File(...)):
    """Store each uploaded file and return its metadata.

    Args:
        files: The files of the multipart request.

    Returns:
        One ``FileInfo`` per uploaded file, in upload order.

    Raises:
        HTTPException: the error raised while processing a file (e.g. 400
            on a failed text extraction), or 500 for any other failure.
    """
    try:
        return [await process_uploaded_file(file) for file in files]
    except HTTPException:
        # Keep the status code chosen further down (e.g. 400 for an
        # unreadable document) instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}")
        raise HTTPException(500, f"Erreur lors de l'upload: {str(e)}") from e
158
+
159
@app.post("/api/summarize")
async def summarize_document(request: SummaryRequest):
    """Summarize a previously uploaded document.

    Args:
        request: The file identifier and the ``max_length`` forwarded to
            the summarization model.

    Returns:
        A dict with a single "summary" key holding the model output.

    Raises:
        HTTPException: 404 when ``file_id`` is not a valid identifier or
            matches no stored file, 400 when text extraction fails, 500
            for any other failure.
    """
    try:
        # file_id is client input that ends up in a glob pattern: accept
        # only a UUID so wildcards or ".." cannot reach other files.
        try:
            file_id = str(uuid.UUID(request.file_id))
        except ValueError:
            raise HTTPException(404, "Fichier non trouvé") from None

        file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
        if file_path is None:
            raise HTTPException(404, "Fichier non trouvé")

        # Use the extractor matching the stored extension; office documents
        # are binary containers and cannot be read as UTF-8 text.
        extractors = {
            ".pdf": extract_text_from_pdf,
            ".docx": extract_text_from_docx,
            ".pptx": extract_text_from_pptx,
            ".xlsx": extract_text_from_excel,
            ".xls": extract_text_from_excel,
        }
        extractor = extractors.get(file_path.suffix)
        if extractor is not None:
            text = extractor(str(file_path))
        else:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

        # NOTE(review): the whole text is sent, as before; the unused prompt
        # this replaces cut the document at 5000 characters - confirm
        # whether the input should be capped.
        summary = client.summarization(
            text=text,
            model=MODELS["summary"],
            parameters={"max_length": request.max_length},
        )

        return {"summary": summary}
    except HTTPException:
        # Keep the 404/400 raised above instead of turning it into a 500.
        raise
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        raise HTTPException(500, f"Erreur de résumé: {str(e)}") from e
190
+
191
@app.post("/api/caption")
async def caption_image(request: CaptionRequest):
    """Generate a caption for a previously uploaded image.

    Args:
        request: The identifier of the stored image.

    Returns:
        A dict with a single "caption" key holding the model output.

    Raises:
        HTTPException: 404 when ``file_id`` is not a valid identifier or
            matches no stored file, 500 for any other failure.
    """
    try:
        # file_id is client input that ends up in a glob pattern: accept
        # only a UUID so wildcards or ".." cannot reach other files.
        try:
            file_id = str(uuid.UUID(request.file_id))
        except ValueError:
            raise HTTPException(404, "Fichier non trouvé") from None

        file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
        if file_path is None:
            raise HTTPException(404, "Fichier non trouvé")

        with open(file_path, "rb") as image_file:
            image_data = image_file.read()

        caption = client.image_to_text(
            image=image_data,
            model=MODELS["caption"],
        )

        return {"caption": caption}
    except HTTPException:
        # Keep the 404 raised above instead of turning it into a 500.
        raise
    except Exception as e:
        logger.error(f"Captioning error: {e}")
        raise HTTPException(500, f"Erreur de description: {str(e)}") from e
208
+
209
@app.post("/api/answer")
async def answer_question(request: QARequest):
    """Answer a question, optionally using an uploaded file as context.

    For .jpg/.jpeg/.png files the context is the caption produced by the
    captioning model; for other files it is the document text. At most the
    first 3000 characters of the context are sent to the chat model.

    Args:
        request: The question and, optionally, the identifier of a stored
            file to use as context.

    Returns:
        A dict with a single "answer" key holding the model's reply.

    Raises:
        HTTPException: 404 when ``file_id`` is given but is not a valid
            identifier or matches no stored file, 400 when text extraction
            fails, 500 for any other failure.
    """
    try:
        context = ""
        if request.file_id:
            # file_id is client input that ends up in a glob pattern: accept
            # only a UUID so wildcards or ".." cannot reach other files.
            try:
                file_id = str(uuid.UUID(request.file_id))
            except ValueError:
                raise HTTPException(404, "Fichier non trouvé") from None

            file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
            if file_path is None:
                raise HTTPException(404, "Fichier non trouvé")

            if file_path.suffix in (".jpg", ".jpeg", ".png"):
                with open(file_path, "rb") as image_file:
                    image_data = image_file.read()
                caption = client.image_to_text(image=image_data, model=MODELS["caption"])
                # NOTE(review): depending on the huggingface_hub version the
                # result may be a plain string or an object exposing
                # ``generated_text`` - confirm; either way a str is needed
                # for the slice below.
                context = str(getattr(caption, "generated_text", caption))
            else:
                # Office documents are binary containers and cannot be read
                # as UTF-8 text, so use the extractor for the extension.
                extractors = {
                    ".pdf": extract_text_from_pdf,
                    ".docx": extract_text_from_docx,
                    ".pptx": extract_text_from_pptx,
                    ".xlsx": extract_text_from_excel,
                    ".xls": extract_text_from_excel,
                }
                extractor = extractors.get(file_path.suffix)
                if extractor is not None:
                    context = extractor(str(file_path))
                else:
                    with open(file_path, "r", encoding="utf-8") as f:
                        context = f.read()

        prompt = f"""
        Vous êtes un assistant IA qui répond à des questions en français.
        Répondez de manière précise et concise.
        Contexte: {context[:3000]}
        Question: {request.question}
        Réponse:
        """

        response = client.chat_completion(
            model=MODELS["qa"],
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
        )

        return {"answer": response.choices[0].message.content}
    except HTTPException:
        # Keep the 404/400 raised above instead of turning it into a 500.
        raise
    except Exception as e:
        logger.error(f"QA error: {e}")
        raise HTTPException(500, f"Erreur de réponse: {str(e)}") from e
245
+
246
@app.get("/api/file/{file_id}")
async def get_file(file_id: str):
    """Return a previously uploaded file.

    Args:
        file_id: The identifier returned by the upload endpoint.

    Returns:
        A ``FileResponse`` for the stored file.

    Raises:
        HTTPException: 404 when ``file_id`` is not a valid identifier or
            matches no stored file.
    """
    # file_id is client input that ends up in a glob pattern: accept only a
    # UUID so a value such as "*" cannot match (and serve) another upload.
    try:
        normalized_id = str(uuid.UUID(file_id))
    except ValueError:
        logger.error(f"File retrieval error: invalid file id {file_id!r}")
        raise HTTPException(404, "Fichier non trouvé") from None

    file_path = next(UPLOAD_FOLDER.glob(f"{normalized_id}*"), None)
    if file_path is None:
        logger.error(f"File retrieval error: no file for id {normalized_id}")
        raise HTTPException(404, "Fichier non trouvé")

    return FileResponse(file_path)
254
+
255
@app.get("/video-background")
async def get_video():
    """Serve the background video stored under the frontend static folder."""
    video_path = BASE_DIR / "frontend/static/videos/background.mp4"
    return FileResponse(video_path)
258
+
259
+ # Gestion des erreurs
260
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Return an ``HTTPException`` as JSON: its status code and its detail."""
    payload = {"detail": exc.detail}
    return JSONResponse(content=payload, status_code=exc.status_code)
266
+
267
@app.exception_handler(Exception)
async def generic_exception_handler(request, exc):
    """Log an unhandled exception and answer with a generic JSON 500.

    The exception text goes to the log only; the client receives a fixed
    message.
    """
    logger.error(f"Unhandled exception: {exc}")
    payload = {"detail": "Une erreur interne est survenue"}
    return JSONResponse(content=payload, status_code=500)