File size: 8,775 Bytes
41624cc
 
 
 
 
d058c7e
41624cc
d058c7e
41624cc
 
 
 
d058c7e
 
41624cc
 
 
18d631d
41624cc
d058c7e
 
bfcdd10
 
41624cc
 
 
387f195
41624cc
 
 
387f195
41624cc
387f195
41624cc
 
 
 
 
d058c7e
ccf72f9
387f195
d058c7e
41624cc
 
 
 
9199f86
41624cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387f195
41624cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387f195
d058c7e
 
 
 
387f195
 
 
 
41624cc
 
d058c7e
41624cc
 
 
 
 
d058c7e
41624cc
 
 
 
 
387f195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41624cc
 
 
 
 
d59b7f5
 
 
 
 
 
db0f953
d59b7f5
 
 
 
 
41624cc
387f195
 
 
18d631d
 
6c86755
1ea5059
 
 
 
 
6c86755
387f195
6c86755
18d631d
 
 
 
 
 
387f195
3ce9638
41624cc
 
 
 
387f195
 
 
 
 
 
 
 
 
 
d058c7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387f195
d058c7e
41624cc
 
 
387f195
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import os
import uuid
import logging
from pathlib import Path
from typing import List, Optional
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from huggingface_hub import InferenceClient
import fitz  # PyMuPDF
from PIL import Image
import io
import pandas as pd
from docx import Document
from pptx import Presentation
import json 

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application initialisation
app = FastAPI()

# CORS configuration
# NOTE(review): wildcard origins and headers combined with
# allow_credentials=True is very permissive — confirm this is intended
# outside of local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
    allow_credentials=True,
)

# File paths: uploads live in an "uploads" directory next to this module,
# created at import time if it does not exist yet.
BASE_DIR = Path(__file__).parent
UPLOAD_FOLDER = BASE_DIR / "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Hugging Face model configuration; the token is read from the HF_TOKEN
# environment variable (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")
client = InferenceClient(token=HF_TOKEN)
MODELS = {
    "summary": "facebook/bart-large-cnn",
    "caption": "Salesforce/blip-image-captioning-large",
    "qa": "distilbert-base-cased-distilled-squad"  # lighter model
}

# Pydantic models
class FileInfo(BaseModel):
    """Metadata describing one stored upload, as built by process_uploaded_file."""

    # UUID string generated at upload time; also the stem of the stored file.
    file_id: str
    # Name of the file as sent by the client.
    file_name: str
    # Lower-cased extension without the leading dot.
    file_type: str
    # Location of the stored copy inside the upload folder.
    file_path: str
    # Text extracted at upload time; None when nothing was extracted.
    extracted_text: Optional[str] = None

class SummaryRequest(BaseModel):
    """Request body of the /api/summarize route."""

    # Identifier of a previously uploaded file.
    file_id: str
    # Forwarded to the summarization model as its "max_length" parameter.
    max_length: int = 150

class CaptionRequest(BaseModel):
    """Request body of the /api/caption route."""

    # Identifier of a previously uploaded file.
    file_id: str

class QARequest(BaseModel):
    """Request body of the /api/answer route."""

    # Identifier of the uploaded file used as context; may be omitted.
    file_id: Optional[str] = None
    # Question to answer against the file's content.
    question: str

# Utility functions
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of each page, in page order, separated by a newline.

    Raises:
        HTTPException: 400 if the document cannot be opened or read.
    """
    try:
        # The context manager closes the document (and its file handle)
        # even when reading a page fails.
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"PDF extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction PDF") from e

def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraphs of a Word document, one per line.

    Args:
        file_path: Path of the .docx file on disk.

    Returns:
        The text of each paragraph, in document order, separated by a newline.

    Raises:
        HTTPException: 400 if the document cannot be opened or read.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
        return "\n".join(lines)
    except Exception as e:
        logger.error(f"DOCX extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction DOCX")

def extract_text_from_pptx(file_path: str) -> str:
    """Return the text of every text-bearing shape of a presentation.

    Args:
        file_path: Path of the .pptx file on disk.

    Returns:
        The text of each shape exposing a ``text`` attribute, in slide then
        shape order, separated by a newline.

    Raises:
        HTTPException: 400 if the presentation cannot be opened or read.
    """
    try:
        deck = Presentation(file_path)
        fragments = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")  # shapes without text are skipped
        ]
        return "\n".join(fragments)
    except Exception as e:
        logger.error(f"PPTX extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction PPTX")

def extract_text_from_excel(file_path: str) -> str:
    """Return a plain-text rendering of every sheet of an Excel workbook.

    Args:
        file_path: Path of the .xlsx / .xls file on disk.

    Returns:
        One section per sheet, each made of a "Feuille: <name>" header
        followed by the sheet rendered with ``DataFrame.to_string()``;
        sections are separated by a blank line.

    Raises:
        HTTPException: 400 if the workbook cannot be opened or read.
    """
    try:
        text = []
        # Open the workbook once and close it deterministically; each sheet
        # is parsed from the already-open handle instead of re-reading the
        # file from disk for every sheet.
        with pd.ExcelFile(file_path) as xls:
            for sheet_name in xls.sheet_names:
                df = xls.parse(sheet_name)
                text.append(f"Feuille: {sheet_name}\n{df.to_string()}")
        return "\n\n".join(text)
    except Exception as e:
        logger.error(f"Excel extraction error: {e}")
        raise HTTPException(400, "Erreur d'extraction Excel") from e

async def process_uploaded_file(file: UploadFile) -> FileInfo:
    """Store an uploaded file under a fresh UUID and extract its text.

    The file is saved as ``<uuid><lower-cased extension>`` in the upload
    folder. Text is extracted for .pdf, .docx, .pptx, .xlsx and .xls files;
    other types are stored without extraction.

    Args:
        file: The upload received from the client.

    Returns:
        A FileInfo describing the stored file; ``extracted_text`` is None
        when no text was extracted.

    Raises:
        HTTPException: 400, propagated from the extraction helpers. The
            stored copy is removed in that case since the caller never
            receives its identifier.
    """
    # The client may omit the filename; fall back to an empty name rather
    # than crashing in Path().
    original_name = file.filename or ""
    file_ext = Path(original_name).suffix.lower()
    file_id = str(uuid.uuid4())
    file_path = str(UPLOAD_FOLDER / f"{file_id}{file_ext}")

    # Copy in fixed-size chunks so a large upload is never held fully in
    # memory.
    chunk_size = 1024 * 1024
    with open(file_path, "wb") as buffer:
        while True:
            chunk = await file.read(chunk_size)
            if not chunk:
                break
            buffer.write(chunk)

    extractors = {
        ".pdf": extract_text_from_pdf,
        ".docx": extract_text_from_docx,
        ".pptx": extract_text_from_pptx,
        ".xlsx": extract_text_from_excel,
        ".xls": extract_text_from_excel,
    }
    extractor = extractors.get(file_ext)
    try:
        text = extractor(file_path) if extractor is not None else ""
    except Exception:
        # Do not leave an unreachable file behind when extraction fails.
        try:
            os.remove(file_path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove {file_path}: {cleanup_error}")
        raise

    return FileInfo(
        file_id=file_id,
        file_name=original_name,
        file_type=file_ext[1:],
        file_path=file_path,
        extracted_text=text if text else None
    )

# API routes
@app.get("/api/test")
async def test_api():
    """Report that the API works and which environment it runs in.

    The environment is "Hugging Face" when the HF_SPACE environment variable
    is set to a non-empty value, "Local" otherwise.
    """
    if os.environ.get("HF_SPACE"):
        environment = "Hugging Face"
    else:
        environment = "Local"
    return {"status": "API working", "environment": environment}

@app.get("/api")
async def api_root():
    """Return a static payload signalling that the API is running."""
    return dict(status="API is running")

@app.post("/api/upload")
async def upload_files(files: List[UploadFile] = File(...)):
    """Store every uploaded file and return their metadata.

    Args:
        files: The files sent in the multipart request.

    Returns:
        A list with one FileInfo per uploaded file, in upload order.

    Raises:
        HTTPException: the status chosen by the processing helpers (400 for
            extraction failures), or 500 for any unexpected error.
    """
    logger.info(f"Upload request received with {len(files)} files")
    try:
        processed_files = []
        for file in files:
            processed_files.append(await process_uploaded_file(file))
    except HTTPException:
        # The helpers already picked a meaningful status code; do not
        # downgrade a client error (400) to a server error (500).
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}")
        raise HTTPException(500, f"Erreur lors de l'upload: {str(e)}") from e
    logger.info(f"Files processed successfully: {len(processed_files)}")
    return processed_files

@app.post("/api/summarize")
async def summarize_document(request: SummaryRequest):
    """Summarize the text of a previously uploaded document.

    Args:
        request: The file identifier and the maximum summary length.

    Returns:
        ``{"summary": ...}`` with the output of the summarization model.

    Raises:
        HTTPException: 404 if the identifier is malformed or matches no
            stored file, 400 if extraction fails or yields no text, 500 for
            any other failure.
    """
    try:
        # Canonicalising through uuid.UUID rejects glob metacharacters and
        # path separators before the id is interpolated into a glob pattern.
        safe_id = str(uuid.UUID(request.file_id))
        file_path = next(UPLOAD_FOLDER.glob(f"{safe_id}*"))
    except (ValueError, StopIteration):
        raise HTTPException(404, "Fichier non trouvé") from None

    try:
        # Office formats are binary containers: they need their dedicated
        # extractor and cannot be read as UTF-8 text.
        extractors = {
            ".pdf": extract_text_from_pdf,
            ".docx": extract_text_from_docx,
            ".pptx": extract_text_from_pptx,
            ".xlsx": extract_text_from_excel,
            ".xls": extract_text_from_excel,
        }
        extractor = extractors.get(file_path.suffix)
        if extractor is not None:
            text = extractor(str(file_path))
        else:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

        if not text.strip():
            raise HTTPException(400, "Aucun texte à résumer")

        summary = client.summarization(
            text=text[:5000],  # cap the input when the document is too long
            model=MODELS["summary"],
            parameters={"max_length": request.max_length}
        )

        return {"summary": summary}
    except HTTPException:
        # Keep the status code chosen above or by the extraction helpers.
        raise
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        raise HTTPException(500, f"Erreur de résumé: {str(e)}") from e

@app.post("/api/caption")
async def caption_image(request: CaptionRequest):
    """Generate a caption for a previously uploaded image.

    Args:
        request: The identifier of the stored file.

    Returns:
        ``{"caption": ...}`` with the output of the captioning model.

    Raises:
        HTTPException: 404 if the identifier is malformed or matches no
            stored file, 500 for any other failure.
    """
    try:
        # Canonicalising through uuid.UUID rejects glob metacharacters and
        # path separators before the id is interpolated into a glob pattern.
        safe_id = str(uuid.UUID(request.file_id))
        file_path = next(UPLOAD_FOLDER.glob(f"{safe_id}*"))
    except (ValueError, StopIteration):
        raise HTTPException(404, "Fichier non trouvé") from None

    try:
        with open(file_path, "rb") as image_file:
            image_data = image_file.read()

        caption = client.image_to_text(
            image=image_data,
            model=MODELS["caption"]
        )

        return {"caption": caption}
    except Exception as e:
        logger.error(f"Captioning error: {e}")
        raise HTTPException(500, f"Erreur de description: {str(e)}") from e

@app.post("/api/answer")
async def answer_question(request: QARequest):
    """Answer a question using an uploaded file as context.

    For images (.jpg, .jpeg, .png) the context is the caption produced by
    the captioning model; for documents it is the extracted text.

    Args:
        request: The question and, optionally, the identifier of the file
            to use as context.

    Returns:
        ``{"answer": ...}`` taken from the "answer" field of the model
        response.

    Raises:
        HTTPException: 404 if the identifier is malformed or matches no
            stored file, 400 if no context is available or extraction
            fails, 500 for any other failure.
    """
    try:
        context = ""
        if request.file_id:
            try:
                # Canonicalising through uuid.UUID rejects glob
                # metacharacters and path separators before the id is
                # interpolated into a glob pattern.
                safe_id = str(uuid.UUID(request.file_id))
                file_path = next(UPLOAD_FOLDER.glob(f"{safe_id}*"))
            except (ValueError, StopIteration):
                raise HTTPException(404, "Fichier non trouvé") from None

            if file_path.suffix in (".jpg", ".jpeg", ".png"):
                with open(file_path, "rb") as image_file:
                    image_data = image_file.read()
                context = client.image_to_text(image=image_data, model=MODELS["caption"])
            else:
                # Office formats are binary containers: they need their
                # dedicated extractor and cannot be read as UTF-8 text.
                extractors = {
                    ".pdf": extract_text_from_pdf,
                    ".docx": extract_text_from_docx,
                    ".pptx": extract_text_from_pptx,
                    ".xlsx": extract_text_from_excel,
                    ".xls": extract_text_from_excel,
                }
                extractor = extractors.get(file_path.suffix)
                if extractor is not None:
                    context = extractor(str(file_path))
                else:
                    with open(file_path, "r", encoding="utf-8") as f:
                        context = f.read()

        if not context:
            raise HTTPException(400, "Aucun contexte trouvé pour répondre à la question.")

        # NOTE(review): recent huggingface_hub releases appear to deprecate
        # InferenceClient.post in favour of task methods such as
        # question_answering — confirm against the pinned version.
        raw_response = client.post(
            model=MODELS["qa"],
            json={
                "inputs": {
                    "question": request.question,
                    "context": context
                }
            }
        )

        # Decode the raw JSON payload returned by the inference endpoint.
        response = json.loads(raw_response)

        return {"answer": response["answer"]}
    except HTTPException:
        # Keep deliberate client errors (400/404) instead of re-wrapping
        # them as a 500.
        raise
    except Exception as e:
        logger.error(f"QA error: {e}")
        raise HTTPException(500, f"Erreur de réponse: {str(e)}") from e

@app.get("/api/file/{file_id}")
async def get_file(file_id: str):
    """Serve a previously uploaded file.

    Args:
        file_id: Identifier returned by the upload route.

    Returns:
        A FileResponse streaming the stored file.

    Raises:
        HTTPException: 404 if the identifier is malformed or matches no
            stored file.
    """
    try:
        # Canonicalising through uuid.UUID rejects glob metacharacters
        # (e.g. "*", which would otherwise match an arbitrary upload) before
        # the id is interpolated into a glob pattern.
        safe_id = str(uuid.UUID(file_id))
        file_path = next(UPLOAD_FOLDER.glob(f"{safe_id}*"))
    except (ValueError, StopIteration):
        logger.error(f"File retrieval error: no file for id {file_id!r}")
        raise HTTPException(404, "Fichier non trouvé") from None
    return FileResponse(file_path)

# Global error handling
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Render an HTTPException as a JSON body of the form {"detail": ...}.

    Args:
        request: The incoming request (unused).
        exc: The raised HTTPException.

    Returns:
        A JSONResponse carrying the exception's status code, detail and,
        when present, its headers.
    """
    return JSONResponse(
        status_code=exc.status_code,
        content={"detail": exc.detail},
        # Forward headers attached to the exception (None when absent) so
        # they are not silently dropped by this custom handler.
        headers=getattr(exc, "headers", None),
    )

@app.exception_handler(Exception)
async def generic_exception_handler(request, exc):
    """Turn any unhandled exception into a generic JSON 500 response.

    Args:
        request: The incoming request (unused).
        exc: The exception that escaped the route handler.

    Returns:
        A JSONResponse with status 500 and a generic detail message; the
        actual error is only written to the log.
    """
    # exc_info attaches the traceback, which the message alone does not
    # carry.
    logger.error(f"Unhandled exception: {exc}", exc_info=exc)
    return JSONResponse(
        status_code=500,
        content={"detail": "Une erreur interne est survenue"},
    )

# Static file mounting at the site root, declared after the API routes.
# NOTE(review): the served directory is BASE_DIR, which also contains this
# module and the uploads folder — confirm that exposing them is intended.
app.mount("/", StaticFiles(directory=BASE_DIR, html=True), name="static")

# Development entry point: run the app with uvicorn on all interfaces,
# port 8000, with auto-reload enabled.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)