Spaces:
Running
Running
| from __future__ import annotations | |
| import asyncio | |
| import concurrent.futures | |
| import json as json_mod | |
| import os | |
| from typing import Annotated, Any, Dict, Optional | |
| from urllib.parse import urlparse | |
| import httpx | |
| from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile, status | |
| from pydantic import BaseModel | |
| from app.config import get_settings | |
| from app.api.deps import ( | |
| get_converter_service, | |
| get_extraction_service, | |
| get_text_cleaner_service, | |
| require_auth, | |
| ) | |
| from app.core.logger import get_logger | |
| from app.models.domain import ConversionError, count_tokens | |
| from app.models.schemas import ConversionMetadata, ConversionResponse, UrlRequest | |
| from app.services.converter_service import ConverterService | |
| from app.services.extraction_service import ExtractionService | |
| from app.services.text_cleaner_service import TextCleanerService | |
# Module-level router object.
# NOTE(review): no route decorators are visible in this view of the file;
# confirm where the handlers below are attached to this router.
router = APIRouter()

# Module logger and application settings, resolved once at import time.
_logger = get_logger(__name__)
_settings = get_settings()

# Upload size ceiling in bytes, read from settings.
_MAX_UPLOAD_BYTES = _settings.max_upload_bytes

# Shared worker pool that the async handlers hand blocking service calls to
# via ``loop.run_in_executor``. The size is capped at 32 and otherwise scales
# with the CPU count (``os.cpu_count()`` may return None, hence the fallback).
_MAX_WORKERS = min(32, 4 + (os.cpu_count() or 1))
_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=_MAX_WORKERS)
def _build_metadata(result) -> ConversionMetadata:
    """Build a ``ConversionMetadata`` from the attributes of *result*.

    Each listed attribute is read from ``result`` and passed through
    unchanged under the same keyword name.
    """
    copied_fields = (
        "source",
        "char_count",
        "word_count",
        "line_count",
        "file_size_bytes",
        "mime_type",
        "content_hash",
        "token_estimate",
    )
    values = {name: getattr(result, name) for name in copied_fields}
    return ConversionMetadata(**values)
async def _build_response(
    result,
    *,
    return_json: bool = False,
    filename: Optional[str] = None,
    raw_data: Optional[bytes] = None,
    mappings: Optional[Dict[str, Dict[str, Any]]] = None,
    extraction_service: Optional[ExtractionService] = None,
    clean_content: bool = False,
    text_cleaner_service: Optional[TextCleanerService] = None,
) -> ConversionResponse:
    """Assemble the ``ConversionResponse`` for a successful conversion.

    Args:
        result: Conversion outcome; its ``markdown``, ``duration_ms`` and
            metadata attributes are read here.
        return_json: When true (and ``filename`` and ``extraction_service``
            are both provided), run structured extraction on the content.
        filename: Name passed to the extraction service.
        raw_data: Original bytes passed to the extraction service, if any.
        mappings: Field mappings passed to the extraction service, if any.
        extraction_service: Service providing ``extract_structured``.
        clean_content: When true (and ``text_cleaner_service`` is provided),
            the markdown is run through the cleaner before anything else.
        text_cleaner_service: Service providing ``clean``.

    Returns:
        A ``ConversionResponse`` with ``success=True``. If extraction returns
        a mapping containing an ``"error"`` key, that value is reported in
        ``error_message`` and ``json_content`` stays ``None``.
    """
    import hashlib
    from collections.abc import Mapping

    loop = asyncio.get_running_loop()
    json_content: Optional[Any] = None
    error_message: Optional[str] = None
    content = result.markdown

    # Both service calls are blocking, so they run on the shared thread pool.
    if clean_content and text_cleaner_service:
        content = await loop.run_in_executor(
            _thread_pool, text_cleaner_service.clean, content
        )

    if return_json and filename and extraction_service:
        json_result = await loop.run_in_executor(
            _thread_pool,
            extraction_service.extract_structured,
            filename,
            content,
            mappings,
            raw_data,
        )
        # Only a mapping can carry an "error" entry. Checking the type first
        # avoids a TypeError on None, and avoids subscripting a list or string
        # that merely contains the text "error".
        if isinstance(json_result, Mapping) and "error" in json_result:
            error_message = json_result["error"]
        else:
            json_content = json_result

    if clean_content and content != result.markdown:
        # Cleaning changed the text, so the statistics on ``result`` no longer
        # describe what is returned; recompute them from the cleaned content.
        metadata = ConversionMetadata(
            source=result.source,
            char_count=len(content),
            word_count=len(content.split()),
            line_count=len(content.splitlines()),
            file_size_bytes=result.file_size_bytes,
            mime_type=result.mime_type,
            content_hash=hashlib.sha256(content.encode()).hexdigest(),
            token_estimate=max(1, count_tokens(content)),
        )
    else:
        metadata = _build_metadata(result)

    return ConversionResponse(
        success=True,
        time_ms=round(result.duration_ms, 3),
        content=content,
        return_json=return_json,
        json_content=json_content,
        metadata=metadata,
        error_message=error_message,
    )
def _raise_for_error(outcome: ConversionError) -> None:
    """Raise an ``HTTPException`` describing a failed conversion.

    The HTTP status is chosen from ``outcome.error_type``; error types not
    listed below map to 500. This function always raises.
    """
    known_statuses = {
        "FileNotFoundError": status.HTTP_404_NOT_FOUND,
        "ValueError": status.HTTP_422_UNPROCESSABLE_ENTITY,
        "PermissionError": status.HTTP_403_FORBIDDEN,
    }
    error_type = outcome.error_type
    if error_type in known_statuses:
        http_code = known_statuses[error_type]
    else:
        http_code = status.HTTP_500_INTERNAL_SERVER_ERROR

    payload = {
        "success": False,
        "error_type": error_type,
        "message": outcome.message,
        "time_ms": round(outcome.duration_ms, 3),
    }
    raise HTTPException(status_code=http_code, detail=payload)
async def convert_file(
    file: Annotated[UploadFile, File(description="File to convert")],
    plain_text: bool = Form(False),
    return_json: bool = Form(False),
    clean_content: bool = Query(False),
    mappings: Optional[str] = Form(None, description="JSON string with field mappings"),
    token: str = Depends(require_auth),
    converter_service: ConverterService = Depends(get_converter_service),
    extraction_service: ExtractionService = Depends(get_extraction_service),
    text_cleaner_service: TextCleanerService = Depends(get_text_cleaner_service),
):
    """Convert an uploaded file and return the result.

    Args:
        file: The uploaded file.
        plain_text: When true, return only the converted text as a
            ``PlainTextResponse`` instead of a ``ConversionResponse``.
        return_json: Forwarded to ``_build_response`` to request structured
            extraction.
        clean_content: When true, run the converted text through the cleaner.
        mappings: Optional JSON string; must decode to a JSON object.
        token: Result of the ``require_auth`` dependency (unused here).
        converter_service: Service providing ``convert_stream``.
        extraction_service: Forwarded to ``_build_response``.
        text_cleaner_service: Service providing ``clean``.

    Raises:
        HTTPException: 400 for a missing file or invalid ``mappings``, 413
            when the upload exceeds the configured size limit, or whatever
            ``_raise_for_error`` raises for a failed conversion.
    """
    if file is None:
        raise HTTPException(status_code=400, detail={"success": False, "message": "No file provided."})

    parsed_mappings = None
    if mappings:
        try:
            parsed_mappings = json_mod.loads(mappings)
        except json_mod.JSONDecodeError:
            # The decoder traceback adds nothing for the client; drop it.
            raise HTTPException(
                status_code=400,
                detail={"success": False, "message": "Invalid JSON in mappings parameter."},
            ) from None
        # Valid JSON that is not an object (a list, a number, ...) cannot be
        # used as field mappings; reject it here rather than deeper in the
        # extraction code.
        if parsed_mappings is not None and not isinstance(parsed_mappings, dict):
            raise HTTPException(
                status_code=400,
                detail={"success": False, "message": "The mappings parameter must be a JSON object."},
            )

    _logger.info("Received request to convert file: %s", file.filename)

    # Read one byte more than the limit: enough to detect an oversized upload
    # without pulling the entire file into memory first.
    raw = await file.read(_MAX_UPLOAD_BYTES + 1)
    if len(raw) > _MAX_UPLOAD_BYTES:
        _logger.error("File %s exceeds %s MB limit", file.filename, _settings.max_upload_mb)
        raise HTTPException(status_code=413, detail={"success": False, "message": f"File exceeds {_settings.max_upload_mb} MB limit."})

    # Use one fallback name for both conversion and extraction, so a missing
    # filename does not silently disable the JSON extraction step.
    filename = file.filename or "upload"

    loop = asyncio.get_running_loop()
    outcome = await loop.run_in_executor(_thread_pool, converter_service.convert_stream, raw, filename)
    if isinstance(outcome, ConversionError):
        _logger.error("Conversion failed for %s: %s", file.filename, outcome.message)
        _raise_for_error(outcome)
    _logger.info("Conversion successful for %s", file.filename)

    if plain_text:
        from fastapi.responses import PlainTextResponse

        # Clean only on this path; ``_build_response`` does its own cleaning,
        # so cleaning up front for the other path would be wasted work.
        content = outcome.markdown
        if clean_content:
            content = await loop.run_in_executor(_thread_pool, text_cleaner_service.clean, content)
        return PlainTextResponse(content)

    response = await _build_response(
        outcome,
        return_json=return_json,
        filename=filename,
        raw_data=raw,
        mappings=parsed_mappings,
        extraction_service=extraction_service,
        clean_content=clean_content,
        text_cleaner_service=text_cleaner_service,
    )
    _logger.info("Request completed for %s", file.filename)
    return response
async def convert_url(
    body: UrlRequest,
    clean_content: bool = Query(False),
    token: str = Depends(require_auth),
    converter_service: ConverterService = Depends(get_converter_service),
    extraction_service: ExtractionService = Depends(get_extraction_service),
    text_cleaner_service: TextCleanerService = Depends(get_text_cleaner_service),
):
    """Convert the document at ``body.url`` and return a ``ConversionResponse``.

    When ``body.return_json`` is set, the URL is downloaded here (so the raw
    bytes can be handed to the extraction step) and converted with
    ``convert_stream``; otherwise the URL itself is passed to
    ``converter_service.convert_url``.

    Args:
        body: Request payload; ``url``, ``return_json`` and ``mappings`` are
            read from it.
        clean_content: Forwarded to ``_build_response``.
        token: Result of the ``require_auth`` dependency (unused here).
        converter_service: Service providing ``convert_stream`` and
            ``convert_url``.
        extraction_service: Forwarded to ``_build_response``.
        text_cleaner_service: Forwarded to ``_build_response``.

    Raises:
        HTTPException: 400 for a non-http(s) URL or a failed download, 413
            when the download exceeds the configured size limit, or whatever
            ``_raise_for_error`` raises for a failed conversion.
    """
    _logger.info("Received request to convert URL: %s", body.url)

    parsed = urlparse(body.url)
    # The URL is caller-supplied. The download path below can only fetch
    # http(s) through httpx; apply the same restriction to both paths so other
    # schemes are never handed to the converter.
    # NOTE(review): redirects are followed and the target host is not checked,
    # so internal addresses remain reachable; confirm whether that is acceptable.
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(
            status_code=400,
            detail={"success": False, "message": "Only http and https URLs are supported."},
        )
    filename = parsed.path.rsplit("/", 1)[-1] or "url_content"

    loop = asyncio.get_running_loop()
    raw_data: Optional[bytes] = None

    if body.return_json:
        _logger.info("Fetching URL for JSON extraction: %s", body.url)
        # Keep the try block limited to the download, the only step whose
        # httpx errors should be reported as "Failed to fetch URL".
        try:
            raw_data = await _download_capped(body.url)
        except httpx.HTTPError as exc:
            _logger.error("Failed to fetch URL %s: %s", body.url, exc)
            raise HTTPException(
                status_code=400,
                detail={"success": False, "message": f"Failed to fetch URL: {exc}"},
            ) from exc
        outcome = await loop.run_in_executor(_thread_pool, converter_service.convert_stream, raw_data, filename)
    else:
        outcome = await loop.run_in_executor(_thread_pool, converter_service.convert_url, body.url)

    if isinstance(outcome, ConversionError):
        _logger.error("Conversion failed for URL %s: %s", body.url, outcome.message)
        _raise_for_error(outcome)
    _logger.info("Conversion successful for URL %s", body.url)

    response = await _build_response(
        outcome,
        return_json=body.return_json,
        filename=filename,
        raw_data=raw_data,
        mappings=body.mappings,
        extraction_service=extraction_service,
        clean_content=clean_content,
        text_cleaner_service=text_cleaner_service,
    )
    _logger.info("Request completed for URL %s", body.url)
    return response


async def _download_capped(url: str) -> bytes:
    """Download *url* and return its body, refusing bodies over the upload limit.

    The response is streamed and the running total is checked per chunk, so an
    oversized body is rejected without being buffered in full.

    Raises:
        HTTPException: 413 once more than ``_MAX_UPLOAD_BYTES`` were received.
        httpx.HTTPError: On transport errors or a non-success status code.
    """
    chunks = []
    received = 0
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        async with client.stream("GET", url) as resp:
            resp.raise_for_status()
            async for chunk in resp.aiter_bytes():
                received += len(chunk)
                if received > _MAX_UPLOAD_BYTES:
                    raise HTTPException(
                        status_code=413,
                        detail={"success": False, "message": f"File exceeds {_settings.max_upload_mb} MB limit."},
                    )
                chunks.append(chunk)
    return b"".join(chunks)