doc-maker

Sleeping

App Files Files Community

pvanand committed on Jan 2, 2025

Commit

2519790

verified ·

1 Parent(s): dc8cd59

Update file_conversion.py

Browse files

Files changed (1) hide show

file_conversion.py +114 -92

file_conversion.py CHANGED Viewed

@@ -1,28 +1,44 @@
-from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from pdf2docx import Converter
 import os
-import shutil
 import pdfkit
 import uuid
 router = APIRouter()
 TEMP_DIR = "/.tempfiles"
-class HTMLRequest(BaseModel):
-    html_content: str
 def ensure_temp_dir():
     os.makedirs(TEMP_DIR, exist_ok=True)
-def remove_file(path: str):
-    if os.path.exists(path):
-        os.unlink(path)
-def generate_temp_filepath(extension: str) -> str:
-    return os.path.join(TEMP_DIR, f"temp_{uuid.uuid4()}.{extension}")
 def html_to_pdf(html_content: str, output_path: str) -> None:
     options = {
@@ -40,102 +56,108 @@ def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
     cv.convert(docx_path)
     cv.close()
-def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
-    try:
-        convert_func(input_path, output_path)
-        if not os.path.exists(output_path):
-            raise FileNotFoundError(f"Converted file not found: {output_path}")
-        background_tasks.add_task(remove_file, input_path)
-        background_tasks.add_task(remove_file, output_path)
-        return FileResponse(
-            output_path,
-            media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-            filename=f"converted_document_{uuid.uuid4()}.docx"
-        )
-    except Exception as e:
-        remove_file(input_path)
-        remove_file(output_path)
-        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
-@router.post("/convert/pdf_to_docx")
-async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
-    if not file.filename.endswith('.pdf'):
-        raise HTTPException(status_code=400, detail="File must be a PDF")
-    ensure_temp_dir()
-    pdf_temp_path = generate_temp_filepath("pdf")
-    docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
-    with open(pdf_temp_path, "wb") as pdf_file:
-        shutil.copyfileobj(file.file, pdf_file)
-    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
-@router.post("/convert/html_to_pdf")
-async def convert_html_to_pdf(request: HTMLRequest):
     ensure_temp_dir()
-    pdf_temp_path = generate_temp_filepath("pdf")
     try:
-        html_to_pdf(request.html_content, pdf_temp_path)
-        with open(pdf_temp_path, "rb") as pdf_file:
-            pdf_content = pdf_file.read()
-        remove_file(pdf_temp_path)
-        return Response(content=pdf_content, media_type="application/pdf")
     except Exception as e:
-        remove_file(pdf_temp_path)
         raise HTTPException(status_code=500, detail=str(e))
-@router.post("/convert/html_to_docx")
-async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
     ensure_temp_dir()
-    pdf_temp_path = generate_temp_filepath("pdf")
-    docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
-    try:
-        html_to_pdf(request.html_content, pdf_temp_path)
-        return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
-    except Exception as e:
-        remove_file(pdf_temp_path)
-        remove_file(docx_temp_path)
-        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
-import markdown
-class MarkdownRequest(BaseModel):
-    markdown_content: str
-def markdown_to_html(markdown_content: str) -> str:
-    return markdown.markdown(markdown_content)
-@router.post("/convert/md_to_pdf")
-async def convert_md_to_pdf(request: MarkdownRequest):
-    ensure_temp_dir()
-    pdf_temp_path = generate_temp_filepath("pdf")
     try:
-        html_content = markdown_to_html(request.markdown_content)
-        html_to_pdf(html_content, pdf_temp_path)
-        with open(pdf_temp_path, "rb") as pdf_file:
-            pdf_content = pdf_file.read()
-        remove_file(pdf_temp_path)
-        return Response(content=pdf_content, media_type="application/pdf")
     except Exception as e:
-        remove_file(pdf_temp_path)
         raise HTTPException(status_code=500, detail=str(e))
-@router.post("/convert/md_to_docx")
-async def convert_md_to_docx(background_tasks: BackgroundTasks, request: MarkdownRequest):
-    ensure_temp_dir()
-    pdf_temp_path = generate_temp_filepath("pdf")
-    docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
-    try:
-        html_content = markdown_to_html(request.markdown_content)
-        html_to_pdf(html_content, pdf_temp_path)
-        return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
-    except Exception as e:
-        remove_file(pdf_temp_path)
-        remove_file(docx_temp_path)
-        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")

+from fastapi import APIRouter, HTTPException, BackgroundTasks, Request
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from pdf2docx import Converter
 import os
 import pdfkit
 import uuid
+import markdown
+from datetime import datetime, timedelta
+from typing import Optional
 router = APIRouter()  # router on which the /convert and /download endpoints below are registered
 TEMP_DIR = "/.tempfiles"  # directory where generated PDF/DOCX files are written
+FILE_RETENTION_MINUTES = 30  # minutes added to the conversion time to compute a file's expires_at
+BASE_URL = "https://pvanand-doc-maker.hf.space/"  # prefix of download links; "download/<file_id>" is appended directly, so keep the trailing "/"
+class MarkdownRequest(BaseModel):  # request body accepted by the Markdown conversion endpoints
+    markdown_content: str  # Markdown source text to convert
+class ConversionResponse(BaseModel):  # response body returned by the /convert endpoints
+    download_url: str  # link produced by get_download_url for fetching the converted file
+    expires_at: datetime  # naive UTC timestamp (built from datetime.utcnow()) after which the file counts as expired
+# In-memory registry of converted files: file_id -> {'file_path', 'mime_type', 'expires_at', 'extension'}
+converted_files = {}  # filled by the convert endpoints, pruned by cleanup_expired_files
 def ensure_temp_dir():  # make sure TEMP_DIR exists before any file is written into it
     os.makedirs(TEMP_DIR, exist_ok=True)  # exist_ok=True: no error when the directory is already present
+def get_download_url(request: Request, file_id: str) -> str:  # build the download link for file_id; both positional arguments are required
+    return f"{BASE_URL}download/{file_id}"  # NOTE: `request` is not used here; the host always comes from BASE_URL
+def generate_temp_filepath(extension: str) -> tuple[str, str]:
+    file_id = str(uuid.uuid4())
+    file_path = os.path.join(TEMP_DIR, f"{file_id}.{extension}")
+    return file_path, file_id
+def markdown_to_html(markdown_content: str) -> str:  # render Markdown text to an HTML string
+    return markdown.markdown(markdown_content)  # called with the text only: no extensions or other options are passed
 def html_to_pdf(html_content: str, output_path: str) -> None:
     options = {
     cv.convert(docx_path)
     cv.close()
+def cleanup_expired_files(background_tasks: BackgroundTasks):
+    current_time = datetime.utcnow()
+    expired_files = []
+    for file_id, metadata in converted_files.items():
+        if current_time > metadata['expires_at']:
+            if os.path.exists(metadata['file_path']):
+                background_tasks.add_task(os.unlink, metadata['file_path'])
+            expired_files.append(file_id)
+    for file_id in expired_files:
+        converted_files.pop(file_id, None)
+@router.post("/convert/md_to_pdf", response_model=ConversionResponse)
+async def convert_md_to_pdf(
+    request: Request,
+    markdown_req: MarkdownRequest,
+    background_tasks: BackgroundTasks
+):
     ensure_temp_dir()
+    cleanup_expired_files(background_tasks)
+    pdf_path, file_id = generate_temp_filepath("pdf")
     try:
+        html_content = markdown_to_html(markdown_req.markdown_content)
+        html_to_pdf(html_content, pdf_path)
+        expiration_time = datetime.utcnow() + timedelta(minutes=FILE_RETENTION_MINUTES)
+        converted_files[file_id] = {
+            'file_path': pdf_path,
+            'mime_type': 'application/pdf',
+            'expires_at': expiration_time,
+            'extension': 'pdf'
+        }
+        return ConversionResponse(
+            download_url=get_download_url(file_id),
+            expires_at=expiration_time
+        )
     except Exception as e:
+        if os.path.exists(pdf_path):
+            os.unlink(pdf_path)
         raise HTTPException(status_code=500, detail=str(e))
+@router.post("/convert/md_to_docx", response_model=ConversionResponse)
+async def convert_md_to_docx(
+    request: Request,
+    markdown_req: MarkdownRequest,
+    background_tasks: BackgroundTasks
+):
     ensure_temp_dir()
+    cleanup_expired_files(background_tasks)
+    pdf_path = generate_temp_filepath("pdf")[0]
+    docx_path, file_id = generate_temp_filepath("docx")
     try:
+        html_content = markdown_to_html(markdown_req.markdown_content)
+        html_to_pdf(html_content, pdf_path)
+        pdf_to_docx(pdf_path, docx_path)
+        # Clean up intermediate PDF
+        os.unlink(pdf_path)
+        expiration_time = datetime.utcnow() + timedelta(minutes=FILE_RETENTION_MINUTES)
+        converted_files[file_id] = {
+            'file_path': docx_path,
+            'mime_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+            'expires_at': expiration_time,
+            'extension': 'docx'
+        }
+        return ConversionResponse(
+            download_url=get_download_url(file_id),
+            expires_at=expiration_time
+        )
     except Exception as e:
+        for path in [pdf_path, docx_path]:
+            if os.path.exists(path):
+                os.unlink(path)
         raise HTTPException(status_code=500, detail=str(e))
+@router.get("/download/{file_id}")
+async def download_file(
+    file_id: str,
+    background_tasks: BackgroundTasks
+):
+    cleanup_expired_files(background_tasks)
+    file_info = converted_files.get(file_id)
+    if not file_info:
+        raise HTTPException(status_code=404, detail="File not found or expired")
+    if datetime.utcnow() > file_info['expires_at']:
+        converted_files.pop(file_id, None)
+        if os.path.exists(file_info['file_path']):
+            os.unlink(file_info['file_path'])
+        raise HTTPException(status_code=404, detail="File has expired")
+    return FileResponse(
+        file_info['file_path'],
+        media_type=file_info['mime_type'],
+        filename=f"converted_{file_id}.{file_info['extension']}"
+    )