SamiKLN committed on
Commit
a927fa4
·
verified ·
1 Parent(s): 774eabd

Upload 3 files

Browse files
Files changed (3) hide show
  1. __init__.py +25 -0
  2. main.py +273 -0
  3. models.py +100 -0
__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Package initializer for DocImageAI Explorer API
3
+
4
+ This file serves two crucial purposes:
5
+ 1. Marks the 'app' directory as a Python package
6
+ 2. Allows relative imports between modules
7
+ """
8
+
9
# Application version string.
__version__ = "1.0.0"

# Names pulled in by 'from app import *'.
# NOTE(review): 'utils' is not among the files visible here — confirm the
# module exists, otherwise the star-import will fail on it.
__all__ = [
    "main",    # Main FastAPI application
    "models",  # Pydantic models
    "utils",   # Utility functions
]
18
+
19
+ # Optional initialization code
20
def init_app():
    """Initialize application components.

    Currently a placeholder: it performs no work and returns None.
    """
    return None
23
+
24
+ # Run initialization when package is imported
25
+ init_app()
main.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
7
+ from fastapi.staticfiles import StaticFiles
8
+ from fastapi.templating import Jinja2Templates
9
+ from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel
12
+ from huggingface_hub import InferenceClient
13
+ import fitz # PyMuPDF
14
+ from PIL import Image
15
+ import io
16
+ import pandas as pd
17
+ from docx import Document
18
+ from pptx import Presentation
19
+
20
# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance
app = FastAPI()

# CORS configuration.
# NOTE(review): every origin, method and header is allowed — confirm this is
# intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# File locations: BASE_DIR is the parent of the directory holding this module.
BASE_DIR = Path(__file__).parent.parent
UPLOAD_FOLDER = BASE_DIR / "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Hugging Face inference client and the model used for each task.
HF_TOKEN = os.getenv("HF_TOKEN")
client = InferenceClient(token=HF_TOKEN)
MODELS = {
    "summary": "facebook/bart-large-cnn",
    "caption": "Salesforce/blip-image-captioning-large",
    "qa": "deepseek-ai/DeepSeek-V2-Chat",
}
48
+
49
+ # Modèles Pydantic
50
class FileInfo(BaseModel):
    # Metadata describing one uploaded file, as returned by the upload endpoint.
    file_id: str                          # identifier generated at upload time
    file_name: str                        # name supplied by the client
    file_type: str                        # extension without the leading dot
    file_path: str                        # location of the stored copy
    extracted_text: Optional[str] = None  # text pulled from the file, if any
56
+
57
class SummaryRequest(BaseModel):
    # Request body for the summarization endpoint.
    file_id: str           # identifier of the uploaded file to summarize
    max_length: int = 150  # forwarded to the summarization model as max_length
60
+
61
class CaptionRequest(BaseModel):
    # Request body for the image captioning endpoint.
    file_id: str  # identifier of the uploaded image
63
+
64
class QARequest(BaseModel):
    # Request body for the question-answering endpoint.
    file_id: Optional[str] = None  # optional uploaded file used as context
    question: str                  # question to answer
67
+
68
# Template engine and static asset mount, both rooted in the frontend directory.
templates = Jinja2Templates(directory=str(BASE_DIR / "frontend/templates"))
app.mount(
    "/static",
    StaticFiles(directory=str(BASE_DIR / "frontend/static")),
    name="static",
)
71
+
72
+ # Fonctions utilitaires
73
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts.

    Raises:
        HTTPException: 400 if the document cannot be opened or read.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # when extraction fails part-way through.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"PDF extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction PDF") from e
80
+
81
def extract_text_from_docx(file_path: str) -> str:
    """Return the text of every paragraph of a DOCX file, joined by newlines.

    Raises:
        HTTPException: 400 if the document cannot be opened or read.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        return "\n".join(lines)
    except Exception as e:
        logger.error(f"DOCX extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction DOCX")
88
+
89
def extract_text_from_pptx(file_path: str) -> str:
    """Return the text of every text-bearing shape of a PPTX file.

    Shapes are visited slide by slide in order; shapes without a ``text``
    attribute are skipped. The collected texts are joined by newlines.

    Raises:
        HTTPException: 400 if the presentation cannot be opened or read.
    """
    try:
        presentation = Presentation(file_path)
        chunks = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(chunks)
    except Exception as e:
        logger.error(f"PPTX extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction PPTX")
101
+
102
def extract_text_from_excel(file_path: str) -> str:
    """Return a textual dump of every sheet of an Excel workbook.

    Each sheet is rendered as ``Feuille: <name>`` followed by the sheet's
    DataFrame string representation; sheets are separated by a blank line.

    Args:
        file_path: Path of the workbook to read.

    Raises:
        HTTPException: 400 if the workbook cannot be opened or parsed.
    """
    try:
        # Open the workbook once and parse each sheet from the same handle;
        # the context manager releases the file even if a sheet fails to parse.
        with pd.ExcelFile(file_path) as xls:
            sections = [
                f"Feuille: {sheet_name}\n{xls.parse(sheet_name).to_string()}"
                for sheet_name in xls.sheet_names
            ]
        return "\n\n".join(sections)
    except Exception as e:
        logger.error(f"Excel extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction Excel") from e
113
+
114
async def process_uploaded_file(file: UploadFile) -> FileInfo:
    """Store an uploaded file under a fresh UUID and extract its text if supported.

    The file is written to UPLOAD_FOLDER as ``<uuid><extension>``. Text is
    extracted for .pdf, .docx, .pptx, .xlsx and .xls files; other extensions
    are stored without extraction.

    Args:
        file: The uploaded file.

    Returns:
        A FileInfo describing the stored file; ``extracted_text`` is None when
        no text was extracted.

    Raises:
        HTTPException: propagated from the extractor when extraction fails.
    """
    # The filename can be absent on an upload; fall back to an empty name
    # instead of crashing in Path(None).
    original_name = file.filename or ""
    file_ext = Path(original_name).suffix.lower()
    file_id = str(uuid.uuid4())
    file_path = str(UPLOAD_FOLDER / f"{file_id}{file_ext}")

    # Persist the upload
    with open(file_path, "wb") as buffer:
        buffer.write(await file.read())

    # Pick the extractor matching the extension, if any
    extractors = {
        ".pdf": extract_text_from_pdf,
        ".docx": extract_text_from_docx,
        ".pptx": extract_text_from_pptx,
        ".xlsx": extract_text_from_excel,
        ".xls": extract_text_from_excel,
    }
    extractor = extractors.get(file_ext)
    try:
        text = extractor(file_path) if extractor is not None else ""
    except Exception:
        # The caller never receives the id of a file whose extraction failed,
        # so do not leave the orphaned copy on disk.
        Path(file_path).unlink(missing_ok=True)
        raise

    return FileInfo(
        file_id=file_id,
        file_name=original_name,
        file_type=file_ext[1:],
        file_path=file_path,
        extracted_text=text if text else None,
    )
141
+
142
+ # Endpoints
143
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render the ``index.html`` template for the landing page."""
    template_context = {"request": request}
    return templates.TemplateResponse("index.html", template_context)
146
+
147
@app.post("/api/upload", response_model=List[FileInfo])
async def upload_files(files: List[UploadFile] = File(...)):
    """Store every uploaded file and return its metadata.

    Args:
        files: The uploaded files.

    Returns:
        One FileInfo per uploaded file, in upload order.

    Raises:
        HTTPException: the status raised while processing a file (e.g. 400 on
            extraction failure), or 500 for any other error.
    """
    try:
        return [await process_uploaded_file(file) for file in files]
    except HTTPException:
        # Keep the original status code (e.g. 400 from an extractor) instead
        # of masking it as a 500.
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}")
        raise HTTPException(500, f"Erreur lors de l'upload: {str(e)}")
158
+
159
@app.post("/api/summarize")
async def summarize_document(request: SummaryRequest):
    """Summarize the text of a previously uploaded document.

    Args:
        request: Carries the file id and the ``max_length`` forwarded to the
            summarization model.

    Returns:
        A dict with a single ``summary`` key holding the model output.

    Raises:
        HTTPException: 404 if the id is malformed or matches no upload, the
            extractor's status if text extraction fails, 500 otherwise.
    """
    try:
        # The id is interpolated into a glob pattern, so only accept a real
        # UUID (normalized to its canonical form); this rejects wildcards.
        try:
            file_id = str(uuid.UUID(request.file_id))
        except ValueError:
            raise HTTPException(404, "Fichier non trouvé") from None

        file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
        if file_path is None:
            raise HTTPException(404, "Fichier non trouvé")

        # Binary document formats must go through their extractor; anything
        # else is read as UTF-8 text.
        extractors = {
            ".pdf": extract_text_from_pdf,
            ".docx": extract_text_from_docx,
            ".pptx": extract_text_from_pptx,
            ".xlsx": extract_text_from_excel,
            ".xls": extract_text_from_excel,
        }
        extractor = extractors.get(file_path.suffix)
        if extractor is not None:
            text = extractor(str(file_path))
        else:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

        summary = client.summarization(
            text=text,
            model=MODELS["summary"],
            parameters={"max_length": request.max_length},
        )

        return {"summary": summary}
    except HTTPException:
        # Preserve deliberate status codes (404 / 400) instead of wrapping
        # them in a 500.
        raise
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        raise HTTPException(500, f"Erreur de résumé: {str(e)}")
190
+
191
@app.post("/api/caption")
async def caption_image(request: CaptionRequest):
    """Generate a caption for a previously uploaded image.

    Args:
        request: Carries the id of the uploaded image.

    Returns:
        A dict with a single ``caption`` key holding the model output.

    Raises:
        HTTPException: 404 if the id is malformed or matches no upload,
            500 for any other error.
    """
    try:
        # The id is interpolated into a glob pattern, so only accept a real
        # UUID (normalized to its canonical form); this rejects wildcards.
        try:
            file_id = str(uuid.UUID(request.file_id))
        except ValueError:
            raise HTTPException(404, "Fichier non trouvé") from None

        file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
        if file_path is None:
            raise HTTPException(404, "Fichier non trouvé")

        with open(file_path, "rb") as image_file:
            image_data = image_file.read()

        caption = client.image_to_text(
            image=image_data,
            model=MODELS["caption"],
        )

        return {"caption": caption}
    except HTTPException:
        # Preserve the 404 instead of wrapping it in a 500.
        raise
    except Exception as e:
        logger.error(f"Captioning error: {e}")
        raise HTTPException(500, f"Erreur de description: {str(e)}")
208
+
209
@app.post("/api/answer")
async def answer_question(request: QARequest):
    """Answer a question, optionally using an uploaded file as context.

    When a file id is given, the context is the caption of the image (for
    .jpg/.jpeg/.png) or the text of the document. Only the first 3000
    characters of the context are placed in the prompt.

    Args:
        request: Carries the question and an optional file id.

    Returns:
        A dict with a single ``answer`` key holding the model's reply.

    Raises:
        HTTPException: 404 if the id is malformed or matches no upload, the
            extractor's status if text extraction fails, 500 otherwise.
    """
    try:
        context = ""
        if request.file_id:
            # The id is interpolated into a glob pattern, so only accept a
            # real UUID (normalized to canonical form); this rejects wildcards.
            try:
                file_id = str(uuid.UUID(request.file_id))
            except ValueError:
                raise HTTPException(404, "Fichier non trouvé") from None

            file_path = next(UPLOAD_FOLDER.glob(f"{file_id}*"), None)
            if file_path is None:
                raise HTTPException(404, "Fichier non trouvé")

            # Binary document formats must go through their extractor.
            extractors = {
                ".pdf": extract_text_from_pdf,
                ".docx": extract_text_from_docx,
                ".pptx": extract_text_from_pptx,
                ".xlsx": extract_text_from_excel,
                ".xls": extract_text_from_excel,
            }
            suffix = file_path.suffix

            if suffix in (".jpg", ".jpeg", ".png"):
                with open(file_path, "rb") as image_file:
                    image_data = image_file.read()
                context = client.image_to_text(image=image_data, model=MODELS["caption"])
            elif suffix in extractors:
                context = extractors[suffix](str(file_path))
            else:
                with open(file_path, "r", encoding="utf-8") as f:
                    context = f.read()

        prompt = f"""
        Vous êtes un assistant IA qui répond à des questions en français.
        Répondez de manière précise et concise.
        Contexte: {context[:3000]}
        Question: {request.question}
        Réponse:
        """

        response = client.chat_completion(
            model=MODELS["qa"],
            messages=[{"role": "user", "content": prompt}],
            max_tokens=500,
        )

        return {"answer": response.choices[0].message.content}
    except HTTPException:
        # Preserve deliberate status codes (404 / 400) instead of wrapping
        # them in a 500.
        raise
    except Exception as e:
        logger.error(f"QA error: {e}")
        raise HTTPException(500, f"Erreur de réponse: {str(e)}")
245
+
246
@app.get("/api/file/{file_id}")
async def get_file(file_id: str):
    """Serve a previously uploaded file by its id.

    Args:
        file_id: The UUID assigned to the file at upload time.

    Returns:
        A FileResponse for the stored file.

    Raises:
        HTTPException: 404 if the id is malformed or matches no upload.
    """
    # The id is interpolated into a glob pattern; without this check a value
    # such as "*" would match (and serve) an arbitrary upload.
    try:
        safe_id = str(uuid.UUID(file_id))
    except ValueError as e:
        logger.error(f"File retrieval error: {e}")
        raise HTTPException(404, "Fichier non trouvé") from None

    file_path = next(UPLOAD_FOLDER.glob(f"{safe_id}*"), None)
    if file_path is None:
        logger.error(f"File retrieval error: no upload matches {safe_id}")
        raise HTTPException(404, "Fichier non trouvé")

    return FileResponse(file_path)
254
+
255
@app.get("/video-background")
async def get_video():
    """Serve the background video stored under the frontend static directory."""
    video_path = BASE_DIR / "frontend/static/videos/background.mp4"
    return FileResponse(video_path)
258
+
259
+ # Gestion des erreurs
260
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Convert an HTTPException into a JSON response of the form {"detail": ...}.

    Args:
        request: The request being handled (unused).
        exc: The raised HTTPException.

    Returns:
        A JSONResponse carrying the exception's status code, detail and headers.
    """
    return JSONResponse(
        status_code=exc.status_code,
        content={"detail": exc.detail},
        # Forward any headers attached to the exception rather than
        # silently dropping them.
        headers=getattr(exc, "headers", None),
    )
266
+
267
@app.exception_handler(Exception)
async def generic_exception_handler(request, exc):
    """Log an unhandled exception and answer with a generic 500 JSON error.

    Args:
        request: The request being handled (unused).
        exc: The exception that reached the application boundary.

    Returns:
        A JSONResponse with status 500 and a generic detail message; the
        exception text is not exposed to the client.
    """
    # exc_info attaches the traceback so the failure can be diagnosed from logs.
    logger.error(f"Unhandled exception: {exc}", exc_info=exc)
    return JSONResponse(
        status_code=500,
        content={"detail": "Une erreur interne est survenue"},
    )
models.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+ from pydantic import BaseModel, Field, HttpUrl
3
+ from datetime import datetime
4
+ from enum import Enum
5
+
6
class FileType(str, Enum):
    """File types handled by the API, identified by extension without the dot."""

    PDF = "pdf"
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
    # Legacy Excel extension; the upload code in main.py accepts ".xls" and
    # reports "xls" as the file type, so it needs a matching member here.
    XLS = "xls"
    JPG = "jpg"
    JPEG = "jpeg"
    PNG = "png"
14
+
15
class UploadedFile(BaseModel):
    """
    Model describing an uploaded file
    """
    file_id: str = Field(..., description="ID unique du fichier")
    file_name: str = Field(..., description="Nom original du fichier")
    file_type: FileType = Field(..., description="Type du fichier")
    file_path: str = Field(..., description="Chemin d'accès interne")
    file_size: int = Field(..., description="Taille en octets")
    # Defaults to the time the model instance is created.
    upload_date: datetime = Field(default_factory=datetime.now, description="Date d'upload")
    extracted_text: Optional[str] = Field(None, description="Texte extrait le cas échéant")

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "file_id": "550e8400-e29b-41d4-a716-446655440000",
                "file_name": "document.pdf",
                "file_type": "pdf",
                "file_path": "/uploads/550e8400-e29b-41d4-a716-446655440000.pdf",
                "file_size": 1024,
                "upload_date": "2023-01-01T00:00:00",
                "extracted_text": "Lorem ipsum...",
            }
        }
39
+
40
class SummaryRequest(BaseModel):
    """
    Model for summarization requests
    """
    file_id: str = Field(..., description="ID du fichier à résumer")
    # Inclusive bounds, so that the documented 50-500 range is accepted in
    # full (exclusive gt/lt rejected both 50 and 500).
    max_length: int = Field(150, ge=50, le=500, description="Longueur maximale du résumé (50-500 mots)")
46
+
47
class SummaryResponse(BaseModel):
    """
    Model for summarization responses
    """
    original_length: int = Field(..., description="Nombre de mots original")
    summary_length: int = Field(..., description="Nombre de mots du résumé")
    summary: str = Field(..., description="Résumé généré")
    processing_time: float = Field(..., description="Temps de traitement en secondes")
55
+
56
class ImageCaptionRequest(BaseModel):
    """
    Model for image captioning requests
    """
    file_id: str = Field(..., description="ID du fichier image")
    # Pydantic v2 (which this file targets, see json_schema_extra) removed the
    # `regex` keyword of Field in favour of `pattern`.
    detail_level: str = Field("normal", pattern="^(low|normal|high)$")
62
+
63
class ImageCaptionResponse(BaseModel):
    """
    Model for image captioning responses
    """
    caption: str = Field(..., description="Description générée")
    # Constrained to the closed interval [0, 1].
    confidence: float = Field(..., ge=0, le=1, description="Confiance du modèle (0-1)")
69
+
70
class QARequest(BaseModel):
    """
    Model for question-answering requests
    """
    file_id: Optional[str] = Field(None, description="ID du fichier de référence (optionnel)")
    # Questions shorter than 5 characters are rejected by validation.
    question: str = Field(..., min_length=5, description="Question à poser")
    context: Optional[str] = Field(None, description="Contexte supplémentaire")
77
+
78
class QAResponse(BaseModel):
    """
    Model for question-answering responses
    """
    answer: str = Field(..., description="Réponse générée")
    source: Optional[str] = Field(None, description="Source de la réponse le cas échéant")
    # Optional; when present it must lie in the closed interval [0, 1].
    confidence: Optional[float] = Field(None, ge=0, le=1, description="Niveau de confiance")
85
+
86
class ErrorResponse(BaseModel):
    """
    Standard model for error payloads
    """
    error: str = Field(..., description="Message d'erreur")
    code: int = Field(..., description="Code HTTP")
    details: Optional[Union[dict, list]] = Field(None, description="Détails supplémentaires")
93
+
94
class HealthCheck(BaseModel):
    """
    Model for the health-check endpoint
    """
    status: str = Field(..., description="Statut du service")
    version: str = Field(..., description="Version de l'API")
    models_ready: bool = Field(..., description="Les modèles IA sont-ils chargés ?")