Spaces:
Sleeping
Sleeping
Rivalcoder
committed on
Commit
·
45a9a23
1
Parent(s):
09c633c
Add application file
Browse files- Dockerfile +9 -1
- app.py +30 -2
- requirements.txt +2 -0
Dockerfile
CHANGED
|
@@ -1,16 +1,24 @@
|
|
| 1 |
FROM python:3.9
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
|
|
|
|
| 5 |
RUN useradd -m -u 1000 user
|
| 6 |
USER user
|
| 7 |
ENV PATH="/home/user/.local/bin:$PATH"
|
| 8 |
|
|
|
|
| 9 |
WORKDIR /app
|
| 10 |
|
|
|
|
| 11 |
COPY --chown=user ./requirements.txt requirements.txt
|
| 12 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 13 |
|
|
|
|
| 14 |
COPY --chown=user . /app
|
| 15 |
|
|
|
|
| 16 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 1 |
FROM python:3.9

# Install system dependencies (Tesseract for OCR).
# --no-install-recommends keeps optional "recommended" packages out of the
# image, and the apt package lists are deleted in the same layer so they are
# never stored in the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Create and switch to a non-root user; everything below runs as "user".
RUN useradd -m -u 1000 user
USER user
# Put the user's local bin directory on PATH so executables installed there
# can be found by name (presumably where pip places uvicorn for a non-root
# install - confirm against the base image's pip configuration).
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy and install Python dependencies first so this layer is reused from the
# build cache whenever only application code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy application code
COPY --chown=user . /app

# Start the app with Uvicorn, listening on all interfaces on port 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
|
@@ -5,6 +5,9 @@ import pytesseract
|
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
app = FastAPI()
|
| 9 |
|
| 10 |
|
|
@@ -30,8 +33,9 @@ async def home():
|
|
| 30 |
<p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
|
| 31 |
<h2>Available endpoints:</h2>
|
| 32 |
<ul>
|
| 33 |
-
<li><b>POST /extract-text</b> - Extract text from PDF pages.</li>
|
| 34 |
-
<li><b>POST /extract-text-ocr</b> - Extract text including OCR from
|
|
|
|
| 35 |
</ul>
|
| 36 |
<p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
|
| 37 |
</div>
|
|
@@ -85,3 +89,27 @@ async def extract_text_ocr(file: UploadFile = File(...)):
|
|
| 85 |
|
| 86 |
except Exception as e:
|
| 87 |
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
| 7 |
|
| 8 |
+
from pdfminer.high_level import extract_pages
|
| 9 |
+
from pdfminer.layout import LTTextContainer
|
| 10 |
+
|
| 11 |
app = FastAPI()
|
| 12 |
|
| 13 |
|
|
|
|
| 33 |
<p>This API allows you to upload PDFs and extract text — including optional OCR for images.</p>
|
| 34 |
<h2>Available endpoints:</h2>
|
| 35 |
<ul>
|
| 36 |
+
<li><b>POST /extract-text</b> - Extract plain text from PDF pages.</li>
|
| 37 |
+
<li><b>POST /extract-text-ocr</b> - Extract text including OCR from image-based PDFs.</li>
|
| 38 |
+
<li><b>POST /extract-text-structured</b> - Extract structured text using pdfminer.</li>
|
| 39 |
</ul>
|
| 40 |
<p>Use a tool like <a href="https://www.postman.com/" target="_blank">Postman</a> or write your own client to send PDF files to the endpoints.</p>
|
| 41 |
</div>
|
|
|
|
| 89 |
|
| 90 |
except Exception as e:
|
| 91 |
return {"error": str(e)}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@app.post("/extract-text-structured")
async def extract_text_structured(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF, page by page, using pdfminer.

    Each page's text is preceded by a ``--- Page N ---`` separator (1-based)
    and consists of the text of every ``LTTextContainer`` element in that
    page's layout, in the order pdfminer yields them.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"filename": ..., "text": ...}`` on success, or
        ``{"error": "<message>"}`` if reading or parsing raises.
    """
    try:
        contents = await file.read()

        # Parse from an in-memory buffer: extract_pages accepts a binary
        # file-like object, so nothing is written to disk and there is no
        # temporary file to clean up after the request.
        # NOTE: extract_pages is synchronous, so parsing runs on the event
        # loop for the duration of the call.
        pdf_stream = io.BytesIO(contents)

        # Collect fragments and join once instead of repeated string +=.
        parts = []
        for page_index, page_layout in enumerate(extract_pages(pdf_stream)):
            parts.append(f"\n\n--- Page {page_index + 1} ---\n\n")
            parts.extend(
                element.get_text()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )

        return {"filename": file.filename, "text": "".join(parts)}

    except Exception as e:
        # Same error shape as the other endpoints in this file: report the
        # failure in the JSON body rather than raising.
        return {"error": str(e)}
|
requirements.txt
CHANGED
|
@@ -4,3 +4,5 @@ PyMuPDF
|
|
| 4 |
python-multipart
|
| 5 |
pytesseract
|
| 6 |
Pillow
|
|
|
|
|
|
|
|
|
| 4 |
python-multipart
|
| 5 |
pytesseract
|
| 6 |
Pillow
|
| 7 |
+
pdfminer.six
|
| 8 |
+
|