Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, Query, HTTPException | |
| from extractous import Extractor, TesseractOcrConfig | |
| from bs4 import BeautifulSoup | |
| app = FastAPI() | |
def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
    """Download the document at *link* and return its text, one entry per page.

    The document is fetched and parsed by ``extractous`` with Tesseract OCR
    configured and XML output enabled; the resulting markup is then split on
    ``<div class="page">`` elements.

    Args:
        link: Absolute ``http://`` or ``https://`` URL of the document.

    Returns:
        A dict with ``received_link`` (the URL exactly as supplied) and
        ``content`` (a list holding the text of each page element found).

    Raises:
        HTTPException: 400 when *link* is not an http(s) URL with a host;
            502 when the document cannot be fetched or extracted.
    """
    # NOTE(review): no route decorator is visible in this view of the file;
    # presumably the handler is registered on `app` elsewhere -- confirm.
    from urllib.parse import urlsplit

    # URL schemes are case-insensitive, so compare against a lowered copy.
    has_http_scheme = link.lower().startswith(("http://", "https://"))
    try:
        # Reject host-less inputs such as "http://" or "https:///file.pdf".
        has_host = bool(urlsplit(link).netloc)
    except ValueError:
        # urlsplit raises on malformed authorities (e.g. a broken IPv6 literal).
        has_host = False
    if not (has_http_scheme and has_host):
        raise HTTPException(status_code=400, detail="Invalid URL format")

    # NOTE(security): the server fetches a caller-supplied URL. Nothing here
    # stops requests to internal or loopback hosts (SSRF); restrict the
    # reachable hosts at the network or allow-list level if that matters.
    extractor = Extractor().set_ocr_config(TesseractOcrConfig())
    extractor = extractor.set_xml_output(True)

    try:
        # The second tuple element (document metadata) is not used here.
        content, _metadata = extractor.extract_url_to_string(link)
    except Exception as err:
        # Request-handler boundary: turn download/parse failures into a
        # controlled HTTP error instead of an unhandled 500.
        raise HTTPException(
            status_code=502,
            detail="Failed to extract content from the provided URL",
        ) from err

    # Presumably the XML output wraps each page in <div class="page"> --
    # confirm against the extractor's output; if none match, the list is empty.
    soup = BeautifulSoup(content, 'html.parser')
    pages_text = [page.get_text() for page in soup.find_all('div', class_='page')]
    return {"received_link": link, "content": pages_text}