pvanand committed on
Commit
2519790
·
verified ·
1 Parent(s): dc8cd59

Update file_conversion.py

Browse files
Files changed (1) hide show
  1. file_conversion.py +114 -92
file_conversion.py CHANGED
@@ -1,28 +1,44 @@
1
- from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
2
  from fastapi.responses import FileResponse
3
  from pydantic import BaseModel
4
  from pdf2docx import Converter
5
  import os
6
- import shutil
7
  import pdfkit
8
  import uuid
 
 
 
9
 
10
  router = APIRouter()
11
 
12
  TEMP_DIR = "/.tempfiles"
 
 
 
 
 
 
 
 
 
13
 
14
- class HTMLRequest(BaseModel):
15
- html_content: str
16
 
17
  def ensure_temp_dir():
18
  os.makedirs(TEMP_DIR, exist_ok=True)
19
 
20
- def remove_file(path: str):
21
- if os.path.exists(path):
22
- os.unlink(path)
23
 
24
- def generate_temp_filepath(extension: str) -> str:
25
- return os.path.join(TEMP_DIR, f"temp_{uuid.uuid4()}.{extension}")
 
 
 
 
 
26
 
27
  def html_to_pdf(html_content: str, output_path: str) -> None:
28
  options = {
@@ -40,102 +56,108 @@ def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
40
  cv.convert(docx_path)
41
  cv.close()
42
 
43
- def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
44
- try:
45
- convert_func(input_path, output_path)
46
- if not os.path.exists(output_path):
47
- raise FileNotFoundError(f"Converted file not found: {output_path}")
48
- background_tasks.add_task(remove_file, input_path)
49
- background_tasks.add_task(remove_file, output_path)
50
- return FileResponse(
51
- output_path,
52
- media_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
53
- filename=f"converted_document_{uuid.uuid4()}.docx"
54
- )
55
- except Exception as e:
56
- remove_file(input_path)
57
- remove_file(output_path)
58
- raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
59
-
60
- @router.post("/convert/pdf_to_docx")
61
- async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
62
- if not file.filename.endswith('.pdf'):
63
- raise HTTPException(status_code=400, detail="File must be a PDF")
64
-
65
- ensure_temp_dir()
66
- pdf_temp_path = generate_temp_filepath("pdf")
67
- docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
68
 
69
- with open(pdf_temp_path, "wb") as pdf_file:
70
- shutil.copyfileobj(file.file, pdf_file)
 
 
 
71
 
72
- return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
73
-
74
- @router.post("/convert/html_to_pdf")
75
- async def convert_html_to_pdf(request: HTMLRequest):
 
 
 
 
 
76
  ensure_temp_dir()
77
- pdf_temp_path = generate_temp_filepath("pdf")
 
 
78
 
79
  try:
80
- html_to_pdf(request.html_content, pdf_temp_path)
81
- with open(pdf_temp_path, "rb") as pdf_file:
82
- pdf_content = pdf_file.read()
83
- remove_file(pdf_temp_path)
84
- return Response(content=pdf_content, media_type="application/pdf")
 
 
 
 
 
 
 
 
 
 
85
  except Exception as e:
86
- remove_file(pdf_temp_path)
 
87
  raise HTTPException(status_code=500, detail=str(e))
88
 
89
- @router.post("/convert/html_to_docx")
90
- async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
 
 
 
 
91
  ensure_temp_dir()
92
- pdf_temp_path = generate_temp_filepath("pdf")
93
- docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
94
 
95
- try:
96
- html_to_pdf(request.html_content, pdf_temp_path)
97
- return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
98
- except Exception as e:
99
- remove_file(pdf_temp_path)
100
- remove_file(docx_temp_path)
101
- raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
102
-
103
-
104
- import markdown
105
-
106
- class MarkdownRequest(BaseModel):
107
- markdown_content: str
108
-
109
- def markdown_to_html(markdown_content: str) -> str:
110
- return markdown.markdown(markdown_content)
111
-
112
- @router.post("/convert/md_to_pdf")
113
- async def convert_md_to_pdf(request: MarkdownRequest):
114
- ensure_temp_dir()
115
- pdf_temp_path = generate_temp_filepath("pdf")
116
 
117
  try:
118
- html_content = markdown_to_html(request.markdown_content)
119
- html_to_pdf(html_content, pdf_temp_path)
120
- with open(pdf_temp_path, "rb") as pdf_file:
121
- pdf_content = pdf_file.read()
122
- remove_file(pdf_temp_path)
123
- return Response(content=pdf_content, media_type="application/pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  except Exception as e:
125
- remove_file(pdf_temp_path)
 
 
126
  raise HTTPException(status_code=500, detail=str(e))
127
 
128
- @router.post("/convert/md_to_docx")
129
- async def convert_md_to_docx(background_tasks: BackgroundTasks, request: MarkdownRequest):
130
- ensure_temp_dir()
131
- pdf_temp_path = generate_temp_filepath("pdf")
132
- docx_temp_path = pdf_temp_path.replace('.pdf', '.docx')
 
133
 
134
- try:
135
- html_content = markdown_to_html(request.markdown_content)
136
- html_to_pdf(html_content, pdf_temp_path)
137
- return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)
138
- except Exception as e:
139
- remove_file(pdf_temp_path)
140
- remove_file(docx_temp_path)
141
- raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, BackgroundTasks, Request
2
  from fastapi.responses import FileResponse
3
  from pydantic import BaseModel
4
  from pdf2docx import Converter
5
  import os
 
6
  import pdfkit
7
  import uuid
8
+ import markdown
9
+ from datetime import datetime, timedelta
10
+ from typing import Optional
11
 
12
  router = APIRouter()
13
 
14
  TEMP_DIR = "/.tempfiles"
15
+ FILE_RETENTION_MINUTES = 30
16
+ BASE_URL = "https://pvanand-doc-maker.hf.space/"
17
+
18
class MarkdownRequest(BaseModel):
    """Request body accepted by the Markdown conversion endpoints."""

    # Raw Markdown source to be converted.
    markdown_content: str
20
+
21
class ConversionResponse(BaseModel):
    """Response body returned by the conversion endpoints."""

    # URL from which the converted file can be downloaded.
    download_url: str
    # Moment after which the file is no longer served; the endpoints in this
    # module derive it from datetime.utcnow(), i.e. a naive UTC timestamp.
    expires_at: datetime
24
 
25
# Track converted files and their metadata.
# Maps file_id -> {'file_path': str, 'mime_type': str,
#                  'expires_at': datetime, 'extension': str}.
# This is a plain module-level dict, so entries live only in this process.
converted_files: dict[str, dict] = {}
27
 
28
def ensure_temp_dir() -> None:
    """Create TEMP_DIR (and any missing parents); do nothing if it exists."""
    os.makedirs(TEMP_DIR, exist_ok=True)
30
 
31
def get_download_url(request: Request, file_id: str) -> str:
    """Return the public download URL for a converted file.

    Args:
        request: The incoming request. It is currently unused: the link is
            always rooted at the module-level BASE_URL rather than derived
            from the request.
        file_id: Identifier of the converted file.

    Returns:
        ``BASE_URL`` followed by ``download/<file_id>``.
    """
    return f"{BASE_URL}download/{file_id}"
34
 
35
def generate_temp_filepath(extension: str) -> tuple[str, str]:
    """Allocate a fresh file name inside TEMP_DIR.

    Args:
        extension: File extension without the leading dot (e.g. ``"pdf"``).

    Returns:
        A ``(file_path, file_id)`` pair, where ``file_id`` is a random UUID4
        string and ``file_path`` is ``TEMP_DIR/<file_id>.<extension>``.
        The file itself is not created.
    """
    file_id = str(uuid.uuid4())
    file_path = os.path.join(TEMP_DIR, f"{file_id}.{extension}")
    return file_path, file_id
39
+
40
def markdown_to_html(markdown_content: str) -> str:
    """Render Markdown text to HTML using ``markdown.markdown`` defaults."""
    return markdown.markdown(markdown_content)
42
 
43
  def html_to_pdf(html_content: str, output_path: str) -> None:
44
  options = {
 
56
  cv.convert(docx_path)
57
  cv.close()
58
 
59
def _unlink_if_present(path: str) -> None:
    """Delete *path*, treating an already-missing file as success."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass


def cleanup_expired_files(background_tasks: BackgroundTasks):
    """Forget expired conversions and schedule their files for deletion.

    Every entry of ``converted_files`` whose ``expires_at`` lies before the
    current ``datetime.utcnow()`` is removed from the registry right away.
    The file deletion itself is queued on ``background_tasks``.

    Args:
        background_tasks: Task queue that receives one deletion per expired
            file.
    """
    current_time = datetime.utcnow()

    # Collect ids first so the dict is not mutated while being iterated.
    expired_ids = [
        file_id
        for file_id, metadata in converted_files.items()
        if current_time > metadata['expires_at']
    ]

    for file_id in expired_ids:
        metadata = converted_files.pop(file_id, None)
        if metadata is not None:
            # The helper tolerates a file that has already disappeared, so no
            # separate existence check (and no check-then-delete race) is needed.
            background_tasks.add_task(_unlink_if_present, metadata['file_path'])
71
+
72
@router.post("/convert/md_to_pdf", response_model=ConversionResponse)
async def convert_md_to_pdf(
    request: Request,
    markdown_req: MarkdownRequest,
    background_tasks: BackgroundTasks
):
    """Convert Markdown to a PDF and return a temporary download link.

    Args:
        request: Incoming request, forwarded to ``get_download_url``.
        markdown_req: Body carrying the Markdown source.
        background_tasks: Queue used to delete expired files.

    Returns:
        A ``ConversionResponse`` with the download URL and expiry time
        (``FILE_RETENTION_MINUTES`` after conversion).

    Raises:
        HTTPException: 500 with the underlying error message if any
            conversion step fails.
    """
    ensure_temp_dir()
    cleanup_expired_files(background_tasks)

    pdf_path, file_id = generate_temp_filepath("pdf")

    try:
        html_content = markdown_to_html(markdown_req.markdown_content)
        html_to_pdf(html_content, pdf_path)

        expiration_time = datetime.utcnow() + timedelta(minutes=FILE_RETENTION_MINUTES)
        # get_download_url takes (request, file_id); build the URL before
        # registering so a failure here cannot leave a dangling entry.
        download_url = get_download_url(request, file_id)

        converted_files[file_id] = {
            'file_path': pdf_path,
            'mime_type': 'application/pdf',
            'expires_at': expiration_time,
            'extension': 'pdf'
        }

        return ConversionResponse(
            download_url=download_url,
            expires_at=expiration_time
        )
    except Exception as e:
        # Keep registry and disk consistent: never advertise a deleted file.
        converted_files.pop(file_id, None)
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
        raise HTTPException(status_code=500, detail=str(e)) from e
103
 
104
@router.post("/convert/md_to_docx", response_model=ConversionResponse)
async def convert_md_to_docx(
    request: Request,
    markdown_req: MarkdownRequest,
    background_tasks: BackgroundTasks
):
    """Convert Markdown to a DOCX and return a temporary download link.

    The document is produced in two steps, Markdown -> HTML -> PDF, then
    PDF -> DOCX; the intermediate PDF is deleted once the DOCX exists.

    Args:
        request: Incoming request, forwarded to ``get_download_url``.
        markdown_req: Body carrying the Markdown source.
        background_tasks: Queue used to delete expired files.

    Returns:
        A ``ConversionResponse`` with the download URL and expiry time
        (``FILE_RETENTION_MINUTES`` after conversion).

    Raises:
        HTTPException: 500 with the underlying error message if any
            conversion step fails.
    """
    ensure_temp_dir()
    cleanup_expired_files(background_tasks)

    # Only the DOCX is served, so only its id is kept.
    pdf_path = generate_temp_filepath("pdf")[0]
    docx_path, file_id = generate_temp_filepath("docx")

    try:
        html_content = markdown_to_html(markdown_req.markdown_content)
        html_to_pdf(html_content, pdf_path)
        pdf_to_docx(pdf_path, docx_path)

        # Clean up intermediate PDF
        os.unlink(pdf_path)

        expiration_time = datetime.utcnow() + timedelta(minutes=FILE_RETENTION_MINUTES)
        # get_download_url takes (request, file_id); build the URL before
        # registering so a failure here cannot leave a dangling entry.
        download_url = get_download_url(request, file_id)

        converted_files[file_id] = {
            'file_path': docx_path,
            'mime_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'expires_at': expiration_time,
            'extension': 'docx'
        }

        return ConversionResponse(
            download_url=download_url,
            expires_at=expiration_time
        )
    except Exception as e:
        # Keep registry and disk consistent: never advertise a deleted file.
        converted_files.pop(file_id, None)
        for path in [pdf_path, docx_path]:
            if os.path.exists(path):
                os.unlink(path)
        raise HTTPException(status_code=500, detail=str(e)) from e
141
 
142
@router.get("/download/{file_id}")
async def download_file(
    file_id: str,
    background_tasks: BackgroundTasks
):
    """Serve a previously converted file by its id.

    Args:
        file_id: Identifier from the download URL. It is used only as a key
            into ``converted_files``, never to build a path.
        background_tasks: Queue used to delete expired files.

    Returns:
        A ``FileResponse`` streaming the file with its recorded MIME type
        and the download name ``converted_<file_id>.<extension>``.

    Raises:
        HTTPException: 404 if the id is unknown, the file has expired, or
            the file is no longer present on disk.
    """
    cleanup_expired_files(background_tasks)

    file_info = converted_files.get(file_id)
    if not file_info:
        raise HTTPException(status_code=404, detail="File not found or expired")

    if datetime.utcnow() > file_info['expires_at']:
        converted_files.pop(file_id, None)
        if os.path.exists(file_info['file_path']):
            os.unlink(file_info['file_path'])
        raise HTTPException(status_code=404, detail="File has expired")

    # A registered file may have vanished from disk; answer with a clean 404
    # instead of letting FileResponse fail while sending the response.
    if not os.path.isfile(file_info['file_path']):
        converted_files.pop(file_id, None)
        raise HTTPException(status_code=404, detail="File not found or expired")

    return FileResponse(
        file_info['file_path'],
        media_type=file_info['mime_type'],
        filename=f"converted_{file_id}.{file_info['extension']}"
    )