Spaces:

aursalan
/

PDF-Reader

Sleeping

App Files Files Community

PDF-Reader / app.py

aursalan

Sending only text in json

a0eb5ea 5 months ago

raw

history blame contribute delete

2.61 kB

	import io
	import logging

	import uvicorn
	from fastapi import FastAPI, File, UploadFile, HTTPException
	from fastapi.concurrency import run_in_threadpool
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse
	from pypdf import PdfReader

	# --- Application instance ---
	# Build the FastAPI application object together with its title,
	# description and version metadata.
	app = FastAPI(
	    version="1.0.0",
	    title="PDF Text Extractor API",
	    description="An API that uses PyPDF to extract text from PDF files.",
	)

	# --- Configure CORS ---
	# Browser origins that are allowed to call this API.
	#
	# The list is deliberately explicit: a "*" wildcard must not be combined
	# with allow_credentials=True, because that would let any website issue
	# credentialed cross-origin requests to this service. Add further origins
	# here if another frontend needs access.
	origins = [
	    "https://clarifyai.pages.dev",  # Production frontend
	    "http://127.0.0.1:5500",  # Local development server
	    "http://localhost:5500",
	]

	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=origins,
	    allow_credentials=True,
	    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
	    allow_headers=["*"],  # Allows all headers
	)

	# --- Define the API Endpoint ---
	logger = logging.getLogger(__name__)

	# Text returned when no page of the PDF yields any text.
	_NO_TEXT_MESSAGE = "No text could be extracted. The PDF might be image-based or empty."
	# Separator inserted between the text of consecutive pages.
	_PAGE_SEPARATOR = "\n\n---\n\n"


	def _extract_pdf_text(pdf_bytes: bytes) -> str:
	    """Return the text of all pages in *pdf_bytes* joined by a separator.

	    Each page's text is stripped of surrounding whitespace; pages whose text
	    is missing or blank after stripping are skipped. If no page contributes
	    any text, a fixed explanatory message is returned instead.

	    Args:
	        pdf_bytes: Raw bytes of the uploaded PDF document.

	    Returns:
	        The joined page texts, or the "no text" message.

	    Raises:
	        Exception: Whatever ``PdfReader`` or ``extract_text`` raises for input
	            they cannot process; the caller converts it to an HTTP error.
	    """
	    reader = PdfReader(io.BytesIO(pdf_bytes))
	    # Strip before testing so whitespace-only pages count as empty instead of
	    # contributing blank segments and bypassing the fallback message.
	    stripped_pages = ((page.extract_text() or "").strip() for page in reader.pages)
	    page_texts = [text for text in stripped_pages if text]

	    if not page_texts:
	        return _NO_TEXT_MESSAGE
	    return _PAGE_SEPARATOR.join(page_texts)


	@app.post("/extract-text")
	async def extract_text_from_pdf(file: UploadFile = File(...)):
	    """Accept a PDF upload, extract its text with PyPDF, and return it as JSON.

	    Args:
	        file: The uploaded file; its content type must be "application/pdf".

	    Returns:
	        A JSON response of the form ``{"text": <extracted text>}``.

	    Raises:
	        HTTPException: 400 if the content type is not "application/pdf";
	            500 if reading or parsing the upload fails.
	    """
	    # Ensure the uploaded file is a PDF
	    if file.content_type != "application/pdf":
	        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

	    try:
	        pdf_bytes = await file.read()
	        # Parsing is synchronous work; run it in a worker thread so the
	        # event loop stays free to serve other requests meanwhile.
	        final_text = await run_in_threadpool(_extract_pdf_text, pdf_bytes)
	    except Exception as e:
	        # Log with traceback and keep the original error chained as the cause.
	        logger.exception("An error occurred during PDF processing")
	        raise HTTPException(status_code=500, detail=f"Failed to process PDF file: {str(e)}") from e

	    # Return the extracted text in a JSON response
	    return JSONResponse(content={"text": final_text})

	# A simple root endpoint to confirm the server is running
	@app.get("/")
	def read_root():
	    """Return a small status payload confirming the service is up."""
	    status_payload = {"status": "PDF PyPDF extraction service is running."}
	    return status_payload

	# Entry point for running the module directly during local development;
	# Hugging Face Spaces will use its own server.
	if __name__ == "__main__":
	    uvicorn.run(
	        app,
	        port=8000,
	        host="0.0.0.0",
	    )