Spaces:
Sleeping
Sleeping
text split per page
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from fastapi import FastAPI, Query, HTTPException
|
| 2 |
from extractous import Extractor, TesseractOcrConfig
|
|
|
|
| 3 |
|
| 4 |
app = FastAPI()
|
| 5 |
|
|
@@ -8,6 +9,9 @@ def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
|
|
| 8 |
if not link.startswith(("http://", "https://")):
|
| 9 |
raise HTTPException(status_code=400, detail="Invalid URL format")
|
| 10 |
extractor = Extractor().set_ocr_config(TesseractOcrConfig())
|
| 11 |
-
extractor = extractor.set_xml_output(
|
| 12 |
content, metadata = extractor.extract_url_to_string(link)
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI, Query, HTTPException
|
| 2 |
from extractous import Extractor, TesseractOcrConfig
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
|
| 5 |
app = FastAPI()
|
| 6 |
|
|
|
|
| 9 |
if not link.startswith(("http://", "https://")):
|
| 10 |
raise HTTPException(status_code=400, detail="Invalid URL format")
|
| 11 |
extractor = Extractor().set_ocr_config(TesseractOcrConfig())
|
| 12 |
+
extractor = extractor.set_xml_output(True)
|
| 13 |
content, metadata = extractor.extract_url_to_string(link)
|
| 14 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 15 |
+
pages = soup.find_all('div', class_='page')
|
| 16 |
+
pages_text = [p.get_text() for p in pages]
|
| 17 |
+
return {"received_link": link, "content": pages_text}
|