File size: 3,094 Bytes
a243c57
4e3c340
 
 
a243c57
 
 
 
4e3c340
 
 
 
 
 
a243c57
 
4e3c340
a243c57
 
 
 
4e3c340
a243c57
 
 
 
 
 
 
 
 
 
 
 
4e3c340
a243c57
4e3c340
a243c57
 
 
 
 
 
 
 
 
4e3c340
 
 
 
 
 
 
 
 
 
a243c57
4e3c340
 
 
 
 
 
 
 
 
 
 
 
 
 
a243c57
 
 
 
 
4e3c340
 
a243c57
 
 
 
 
4e3c340
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import io
import logging

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware

# --- Initialize the FastAPI app ---
app = FastAPI(
    title="PDF OCR Extractor API",
    description="An API that uses Tesseract OCR to extract text from PDF files.",
    version="1.0.0",
)

# --- Configure CORS ---
# Browsers only let a web page call this API when its origin is allowed here.
# The list is deliberately explicit: a "*" entry combined with
# allow_credentials=True makes the allow-list meaningless and lets any website
# issue credentialed requests. Add further origins by name instead of
# re-introducing the wildcard.
origins = [
    "https://clarifyai.pages.dev",  # Production frontend
    "http://127.0.0.1:5500",  # Local development server
    "http://localhost:5500",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)

# --- OCR helpers ---
logger = logging.getLogger(__name__)

# Rendering resolution used before OCR. DPI is critical for OCR accuracy;
# 300 is a good standard.
_OCR_DPI = 300

# Inserted between the text of consecutive pages in the response.
_PAGE_SEPARATOR = "\n\n--- Page Break ---\n\n"


def _ocr_pdf_bytes(pdf_data: bytes) -> str:
    """Run OCR over every page of an in-memory PDF and return the joined text.

    Each page is rendered to a PNG at ``_OCR_DPI``, loaded as a PIL image and
    passed to Tesseract. Pages whose OCR output is empty or whitespace-only
    are skipped, so they do not produce stray page-break separators.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The text of all non-blank pages joined with ``_PAGE_SEPARATOR``; an
        empty string when no page produced any text.
    """
    page_texts = []
    # Open the PDF from the in-memory data; the context manager closes it.
    with fitz.open(stream=pdf_data, filetype="pdf") as doc:
        for page in doc:
            # 1. Render the page to a high-resolution image (pixmap).
            pix = page.get_pixmap(dpi=_OCR_DPI)

            # 2. Convert the pixmap to a PIL Image and release it once the
            #    OCR call has finished.
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                # 3. Use Tesseract to extract text from the image.
                #    Specify language if known, e.g., lang='eng'
                page_text = pytesseract.image_to_string(image)

            # A plain truthiness check lets whitespace-only output through,
            # so test the stripped text while keeping the original content.
            if page_text.strip():
                page_texts.append(page_text)

    return _PAGE_SEPARATOR.join(page_texts)


# --- Define the API Endpoint ---
@app.post("/extract-text")
async def extract_text_from_pdf_ocr(file: UploadFile = File(...)):
    """
    Accepts a PDF file, extracts its text content using OCR, and returns it.

    Args:
        file: The uploaded file; its declared content type must be
            ``application/pdf``.

    Returns:
        A JSON response of the form ``{"text": "<extracted text>"}``.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF or is
            empty; 500 when reading or OCR processing fails.
    """
    # Ensure the uploaded file is a PDF
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    try:
        # Read the uploaded file into memory
        pdf_data = await file.read()

        # An empty body is a client error; reject it here rather than letting
        # the PDF parser fail and surface as a 500.
        if not pdf_data:
            raise HTTPException(status_code=400, detail="The uploaded PDF file is empty.")

        # NOTE(review): rendering and OCR run synchronously inside this
        # coroutine, so the event loop is occupied for the whole request.
        # Check PyMuPDF's threading constraints before moving this work to a
        # worker thread.
        final_text = _ocr_pdf_bytes(pdf_data)
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewritten as 500s.
        raise
    except Exception as e:
        # Handle potential errors during OCR processing; log with traceback.
        logger.exception("An error occurred during OCR processing")
        # NOTE(review): the exception text is returned to the client as in the
        # original implementation; consider a generic message if internal
        # details should not be exposed.
        raise HTTPException(status_code=500, detail=f"Failed to process PDF file: {e}") from e

    # Return the extracted text in a JSON response
    return JSONResponse(content={"text": final_text})

# Minimal root route used to confirm that the server is up
@app.get("/")
def read_root():
    """Return a fixed status payload stating that the service is running."""
    status_payload = {"status": "PDF OCR extraction service is running."}
    return status_payload