madankn79 committed on
Commit
4a3cccb
·
1 Parent(s): e924673

Initial Commit 5.1.0

Browse files
Files changed (3) hide show
  1. .idea/.gitignore +10 -0
  2. app.py +157 -0
  3. requirements.txt +7 -0
.idea/.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Environment-dependent path to Maven home directory
7
+ /mavenHomeManager.xml
8
+ # Datasource local storage ignored files
9
+ /dataSources/
10
+ /dataSources.local.xml
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face-style async API using FastAPI
2
+ from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
3
+ from fastapi.responses import JSONResponse
4
+ from pydantic import BaseModel
5
+ import base64
6
+ import tempfile
7
+ import uuid
8
+ import os
9
+ import secrets
10
+ import aiohttp
11
+ import shutil
12
+ import pdfplumber
13
+ import pytesseract
14
+ from pdf2image import convert_from_bytes
15
+ import io
16
+ import os
17
+ from PIL import Image
18
+ import subprocess
19
+ from bs4 import BeautifulSoup
20
+
21
+ from pdf_processing import extract_pdf_to_html # Assume your function is modularized here
22
+
23
# Shared secret that API clients must present. It is read from the
# PDF_API_KEY environment variable, with a literal placeholder as fallback.
# NOTE(review): the fallback value is visible in source control -- set
# PDF_API_KEY explicitly in any real deployment.
API_KEY = os.environ.get("PDF_API_KEY", "secret-api-key")

# Application object that the route decorators in this module attach to.
app = FastAPI()

# In-memory client_id -> token table (could be replaced by Redis or a DB).
api_tokens = {
    "client-1": API_KEY,
}
29
+
30
class PDFRequest(BaseModel):
    # Request body accepted by the POST /convert-pdf route.
    # (Documented with comments rather than a docstring so the generated
    # schema description is left untouched.)

    # Base64-encoded bytes of the PDF to convert.
    file_b64: str

    # URL that receives the conversion result via HTTP POST.
    callback_url: str

    # Identifier used to look up the caller's expected token.
    client_id: str

    # Secret that must match the token stored for client_id.
    token: str
35
+
36
@app.post("/convert-pdf")
async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
    """Accept a base64-encoded PDF and schedule its conversion to HTML.

    The request is authenticated against ``api_tokens``, the PDF is decoded
    and written to a temporary file, and the conversion is queued as a
    background task. The result is delivered later to ``payload.callback_url``.

    Args:
        payload: Request body carrying the PDF, callback URL and credentials.
        background_tasks: FastAPI task queue used to run the conversion after
            the response has been sent.

    Returns:
        A JSON response ``{"status": "processing", "task_id": <uuid4 string>}``.

    Raises:
        HTTPException: 401 when the client id/token pair is not recognised,
            400 when ``file_b64`` is not valid base64 or decodes to nothing.
    """
    # Auth check. compare_digest keeps the comparison time independent of
    # where the two tokens first differ; bytes are used because the str form
    # of compare_digest rejects non-ASCII input.
    expected_token = api_tokens.get(payload.client_id)
    if expected_token is None or not secrets.compare_digest(
        expected_token.encode("utf-8"), payload.token.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API credentials")

    # Decode strictly: without validate=True, characters outside the base64
    # alphabet are silently dropped and garbage decodes "successfully".
    # Whitespace is stripped first so line-wrapped base64 is still accepted.
    try:
        compact_b64 = "".join(payload.file_b64.split())
        pdf_bytes = base64.b64decode(compact_b64, validate=True)
    except ValueError as e:  # binascii.Error is a ValueError subclass
        raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}") from e
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Invalid PDF: empty payload")

    # NamedTemporaryFile creates the file atomically with a unique name;
    # tempfile.mktemp only returns a name, which another process could claim
    # before we open it. delete=False because the background task owns the
    # file's lifetime and removes it when done.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_pdf_path = tmp_file.name

    task_id = str(uuid.uuid4())

    # Run conversion in background
    background_tasks.add_task(
        handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id
    )

    return JSONResponse({"status": "processing", "task_id": task_id})
57
+
58
async def handle_pdf_processing(pdf_path, callback_url, task_id):
    """Convert the PDF at *pdf_path* to HTML and POST the result to a callback.

    The temporary PDF is always removed, whether or not conversion succeeds.
    If conversion raises, the exception propagates and no callback is sent.
    Callback delivery failures are reported on stdout and otherwise ignored.

    Args:
        pdf_path: Path of the temporary PDF file to convert; deleted here.
        callback_url: URL that receives ``{"task_id": ..., "html": ...}``.
        task_id: Identifier echoed back to the callback receiver.
    """
    # NOTE(review): extract_pdf_to_html is a plain synchronous call made from
    # inside a coroutine, so it occupies the event loop while it runs.
    try:
        # The context manager closes the handle before the file is removed;
        # a bare open() here leaked the descriptor.
        with open(pdf_path, "rb") as pdf_file:
            html, _ = extract_pdf_to_html(pdf_file)
    finally:
        # Remove unconditionally and tolerate a missing file, rather than
        # checking existence first and racing with other cleanup.
        try:
            os.remove(pdf_path)
        except FileNotFoundError:
            pass

    # Post result to callback URL
    try:
        async with aiohttp.ClientSession() as session:
            # "async with" releases the response; raise_for_status turns a
            # 4xx/5xx reply into an error so it is reported below.
            async with session.post(callback_url, json={
                "task_id": task_id,
                "html": html,
            }) as response:
                response.raise_for_status()
    except Exception as e:
        # Best-effort delivery: report the failure but do not crash the task.
        print(f"Callback failed: {e}")
74
+
75
def extract_text_from_image(image: "Image.Image") -> str:
    """Run the ``latexocr`` command-line tool on *image* and return HTML.

    The image is written to a uniquely named temporary PNG, which is removed
    again before returning.

    Args:
        image: Object with a ``save(path)`` method (a PIL image).

    Returns:
        ``<pre>\\[ ... \\]</pre>`` wrapping the HTML-escaped tool output on
        success; ``""`` when the tool exits non-zero or prints nothing;
        ``<i>Error: ...</i>`` when the tool could not be run at all.
    """
    from html import escape  # local import: only this helper needs it

    # A unique temp file replaces the fixed "temp_math.png" in the working
    # directory, which concurrent calls would overwrite and which was never
    # cleaned up.
    fd, temp_img_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        image.save(temp_img_path)
        try:
            # Run LaTeX-OCR CLI (assumes it's installed)
            result = subprocess.run(
                ["latexocr", temp_img_path],
                capture_output=True,
                text=True,
            )
        except Exception as e:
            return f"<i>Error: {escape(str(e))}</i>"

        if result.returncode != 0:
            return ""
        latex = result.stdout.strip()
        if not latex:
            # Avoid emitting an empty "\[\]" formula block.
            return ""
        # LaTeX routinely contains <, > and &, so escape before embedding.
        return f"<pre>\\[{escape(latex, quote=False)}\\]</pre>"
    finally:
        try:
            os.remove(temp_img_path)
        except FileNotFoundError:
            pass
89
+
90
def extract_pdf_to_html(file, docx_path="output.docx"):
    """Convert a PDF into an HTML string and a .docx file.

    Each page contributes a heading, one paragraph per extracted text line,
    and, for every embedded image, the LaTeX-OCR result (if any) followed by
    the image itself as an inline base64 PNG. A table of contents linking to
    each page heading is prepended to the HTML.

    Args:
        file: Object whose ``name`` attribute is the PDF path (for example an
            open file); a path or nameless stream is passed to pdfplumber
            as-is. ``None`` yields a placeholder result.
        docx_path: Where the generated Word document is written.

    Returns:
        ``(html, docx_path)``; ``("<p>No file uploaded.</p>", "")`` when
        *file* is ``None``.
    """
    if file is None:
        return "<p>No file uploaded.</p>", ""

    # The module never imports these names although this function relies on
    # them, so bring them into scope here (python-docx is only needed once a
    # document is actually built).
    from html import escape

    import docx
    from docx.shared import Inches

    pdf_source = getattr(file, "name", file)

    # Fragments are collected and joined once instead of growing a string
    # with repeated "+=".
    html_parts = []
    toc = []
    docx_output = docx.Document()

    with pdfplumber.open(pdf_source) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            page_title = f"Page {page_num}"
            toc.append(f"<li><a href='#page{page_num}'>{page_title}</a></li>")
            html_parts.append(f"<h2 id='page{page_num}'>{page_title}</h2>\n")
            docx_output.add_heading(page_title, level=2)

            # Extract text and add paragraphs
            text = page.extract_text()
            if text:
                for line in text.split("\n"):
                    # PDF text is arbitrary content: escape it so characters
                    # such as "<" or "&" cannot break the markup.
                    html_parts.append(f"<p>{escape(line, quote=False)}</p>\n")
                    docx_output.add_paragraph(line)
            else:
                html_parts.append("<p><i>No text detected on this page.</i></p>")

            # Process images embedded in page (diagrams, formulas, etc)
            page_x0, page_top, page_x1, page_bottom = page.bbox
            for img_obj in page.images:
                # Clip bbox to page boundaries
                crop_box = (
                    max(img_obj["x0"], page_x0),
                    max(img_obj["top"], page_top),
                    min(img_obj["x1"], page_x1),
                    min(img_obj["bottom"], page_bottom),
                )

                try:
                    cropped = page.crop(crop_box).to_image(resolution=300).original
                except Exception:
                    # Best effort: skip images that cannot be cropped/rendered.
                    continue

                # Run LaTeX OCR on image for math formulas
                math_html = extract_text_from_image(cropped)
                if math_html.strip():
                    html_parts.append(f"<div>{math_html}</div>\n")
                    # Also add LaTeX text to docx as paragraph
                    docx_output.add_paragraph(
                        BeautifulSoup(math_html, "html.parser").text
                    )

                # Convert cropped image to base64 and embed in HTML
                buffer = io.BytesIO()
                cropped.save(buffer, format="PNG")
                b64_img = base64.b64encode(buffer.getvalue()).decode("utf-8")
                html_parts.append(
                    f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
                )

                # Add image to docx (rewind: saving left the cursor at the end)
                buffer.seek(0)
                docx_output.add_picture(buffer, width=Inches(5))

    full_html = f"<ul>{''.join(toc)}</ul>\n" + "".join(html_parts)
    docx_output.save(docx_path)
    return full_html, docx_path
153
+
154
# Liveness probe. NOTE: this route performs no credential check.
@app.get("/health")
def health():
    # Constant payload; nothing is inspected to produce it.
    status_payload = {"status": "ok"}
    return status_payload
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ pytesseract
4
+ pdf2image
5
+ Pillow
6
+ beautifulsoup4
7
+ python-docx