vasilee committed on
Commit
1a755c0
·
1 Parent(s): 63069dd

extract text and tables

Browse files
Files changed (4) hide show
  1. app.py +58 -14
  2. requirements.txt +2 -1
  3. test_pdf_endpoint.py +6 -1
  4. utils.py +52 -0
app.py CHANGED
@@ -1,34 +1,45 @@
1
  from fastapi import FastAPI, File, UploadFile
2
  from fastapi.responses import JSONResponse
3
- import pypdfium2 as pdfium
4
  import base64
5
  import re
6
 
7
  app = FastAPI()
8
 
9
- def extract_text_from_pdf(pdf_bytes: bytes) -> str:
 
 
 
 
10
  """
11
- Extract text from PDF bytes using pypdfium2
12
  """
13
- pdf_file = pdfium.PdfDocument(pdf_bytes)
14
- text_parts = []
 
15
 
16
  try:
17
- for page in pdf_file:
18
- textpage = page.get_textpage()
19
- text = textpage.get_text_range()
20
- text_parts.append(text)
 
 
 
 
 
 
 
21
  finally:
22
- pdf_file.close()
23
 
24
- return "\n".join(text_parts)
25
 
26
  @app.get("/")
27
  def greet_json():
28
  return {"Hello": "World!"}
29
 
30
  @app.post("/extract-text")
31
- async def extract_pdf_text(file: UploadFile = File(...)):
32
  """
33
  Endpoint to extract text from uploaded PDF file
34
  """
@@ -42,9 +53,21 @@ async def extract_pdf_text(file: UploadFile = File(...)):
42
  # Read the file content
43
  content = await file.read()
44
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  try:
46
  # Extract text from PDF
47
- extracted_text = extract_text_from_pdf(content)
48
 
49
  return {
50
  "filename": file.filename,
@@ -74,6 +97,27 @@ async def extract_pdf_text_base64(data: dict):
74
  # Extract filename if provided
75
  filename = data.get('filename', 'unknown.pdf')
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  try:
78
  # Handle data URL format (e.g., "data:application/pdf;base64,...")
79
  if base64_string.startswith('data:'):
@@ -90,7 +134,7 @@ async def extract_pdf_text_base64(data: dict):
90
  pdf_bytes = base64.b64decode(base64_string)
91
 
92
  # Extract text from PDF
93
- extracted_text = extract_text_from_pdf(pdf_bytes)
94
 
95
  return {
96
  "filename": filename,
 
1
  from fastapi import FastAPI, File, UploadFile
2
  from fastapi.responses import JSONResponse
 
3
  import base64
4
  import re
5
 
6
  app = FastAPI()
7
 
8
+ from io import BytesIO
9
+ from gmft.pdf_bindings import PyPDFium2Document
10
+ from utils import get_page_text_with_tables, detector, formatter
11
+
12
def extract_text_from_pdf(pdf_bytes: bytes, page_numbers=None) -> list[str]:
    """
    Extract text from PDF bytes using gmft without temporary files.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        page_numbers: Optional collection of page indices to extract. Indices
            are compared against ``enumerate(doc)`` positions, so the first
            page is 0. ``None`` or an empty collection selects every page.

    Returns:
        One entry per selected page, in document order, each being whatever
        ``get_page_text_with_tables`` returned for that page.
    """
    # Create a PyPDFium2Document directly from bytes
    doc = PyPDFium2Document(pdf_bytes)

    try:
        # Built inside the try so the document is still closed if the
        # selection cannot be turned into a set (e.g. unhashable items).
        # A falsy selection means "no filter".
        wanted = set(page_numbers) if page_numbers else None

        pages = []
        for page_num, page in enumerate(doc):
            try:
                if wanted is not None and page_num not in wanted:
                    continue
                tables = detector.extract(page)
                fmt_tables = [
                    formatter.extract(table, margin=(0, 0, 0, 0))
                    for table in tables
                ]
                pages.append(get_page_text_with_tables(page, fmt_tables))
            finally:
                # Also runs on the `continue` path, so pages that are skipped
                # by the filter are closed just like the processed ones.
                page.close()
    finally:
        doc.close()

    return pages
36
 
37
  @app.get("/")
38
  def greet_json():
39
  return {"Hello": "World!"}
40
 
41
  @app.post("/extract-text")
42
+ async def extract_pdf_text(file: UploadFile = File(...), page_numbers: str = None):
43
  """
44
  Endpoint to extract text from uploaded PDF file
45
  """
 
53
  # Read the file content
54
  content = await file.read()
55
 
56
+ # Parse page_numbers if provided
57
+ parsed_page_numbers = None
58
+ if page_numbers:
59
+ try:
60
+ # Convert comma-separated string to list of integers
61
+ parsed_page_numbers = [int(p.strip()) for p in page_numbers.split(',') if p.strip()]
62
+ except ValueError:
63
+ return JSONResponse(
64
+ status_code=400,
65
+ content={"error": "Invalid page_numbers format. Use comma-separated integers."}
66
+ )
67
+
68
  try:
69
  # Extract text from PDF
70
+ extracted_text = extract_text_from_pdf(content, parsed_page_numbers)
71
 
72
  return {
73
  "filename": file.filename,
 
97
  # Extract filename if provided
98
  filename = data.get('filename', 'unknown.pdf')
99
 
100
+ # Extract page_numbers if provided
101
+ page_numbers = data.get('page_numbers')
102
+ parsed_page_numbers = None
103
+ if page_numbers:
104
+ try:
105
+ # Handle both string and list formats
106
+ if isinstance(page_numbers, str):
107
+ parsed_page_numbers = [int(p.strip()) for p in page_numbers.split(',') if p.strip()]
108
+ elif isinstance(page_numbers, list):
109
+ parsed_page_numbers = [int(p) for p in page_numbers if isinstance(p, (int, str))]
110
+ else:
111
+ return JSONResponse(
112
+ status_code=400,
113
+ content={"error": "Invalid page_numbers format. Use comma-separated integers or array."}
114
+ )
115
+ except (ValueError, TypeError):
116
+ return JSONResponse(
117
+ status_code=400,
118
+ content={"error": "Invalid page_numbers format. Use comma-separated integers or array."}
119
+ )
120
+
121
  try:
122
  # Handle data URL format (e.g., "data:application/pdf;base64,...")
123
  if base64_string.startswith('data:'):
 
134
  pdf_bytes = base64.b64decode(base64_string)
135
 
136
  # Extract text from PDF
137
+ extracted_text = extract_text_from_pdf(pdf_bytes, parsed_page_numbers)
138
 
139
  return {
140
  "filename": filename,
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  fastapi[all]
2
  aiohttp
3
  uvicorn[standard]
4
- pypdfium2
 
 
1
  fastapi[all]
2
  aiohttp
3
  uvicorn[standard]
4
+ pypdfium2
5
+ gmft
test_pdf_endpoint.py CHANGED
@@ -6,10 +6,11 @@ import base64
6
  async def test_pdf_extraction():
7
  # Check if filename is provided as argument
8
  if len(sys.argv) < 2:
9
- print("Usage: python test_pdf_endpoint.py <pdf_filename>")
10
  return
11
 
12
  pdf_filename = sys.argv[1]
 
13
 
14
  # Read the PDF file
15
  try:
@@ -25,6 +26,8 @@ async def test_pdf_extraction():
25
  # Test regular file upload endpoint
26
  print("\n--- Testing file upload endpoint ---")
27
  url = "http://localhost:8000/extract-text"
 
 
28
 
29
  try:
30
  async with aiohttp.ClientSession() as session:
@@ -61,6 +64,8 @@ async def test_pdf_extraction():
61
  "file": base64_string,
62
  "filename": pdf_filename
63
  }
 
 
64
 
65
  async with aiohttp.ClientSession() as session:
66
  async with session.post(
 
6
  async def test_pdf_extraction():
7
  # Check if filename is provided as argument
8
  if len(sys.argv) < 2:
9
+ print("Usage: python test_pdf_endpoint.py <pdf_filename> [page_numbers]")
10
  return
11
 
12
  pdf_filename = sys.argv[1]
13
+ page_numbers = sys.argv[2] if len(sys.argv) > 2 else None
14
 
15
  # Read the PDF file
16
  try:
 
26
  # Test regular file upload endpoint
27
  print("\n--- Testing file upload endpoint ---")
28
  url = "http://localhost:8000/extract-text"
29
+ if page_numbers:
30
+ url += f"?page_numbers={page_numbers}"
31
 
32
  try:
33
  async with aiohttp.ClientSession() as session:
 
64
  "file": base64_string,
65
  "filename": pdf_filename
66
  }
67
+ if page_numbers:
68
+ payload["page_numbers"] = page_numbers
69
 
70
  async with aiohttp.ClientSession() as session:
71
  async with session.post(
utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gmft.formatters.base import FormattedTable
2
+ from gmft.formatters.page.base import FormattedPage
3
+ from gmft.pdf_bindings.base import BasePage
4
+
5
+ from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig
6
+ from gmft.pdf_bindings import PyPDFium2Document
7
+
8
# Shared gmft objects, constructed once when this module is imported.
detector = AutoTableDetector()

# Formatter configuration: semantic spanning cells and multi-header both off.
_format_config = TATRFormatConfig(
    semantic_spanning_cells=False,
    enable_multi_header=False,
)
formatter = AutoTableFormatter(_format_config)
10
+
11
+
12
def get_page_text_with_tables(
    page: "BasePage", tables: "list[FormattedTable]"
) -> str:
    """
    Render a page as text, replacing each table region with a LaTeX table.

    Words yielded by ``page._get_positions_and_text_and_breaks()`` are tested
    against every table's ``rect``. The first time a word intersects a table,
    that table's ``df()`` is rendered with ``to_latex(index=False)`` and
    inserted; every word intersecting a table is itself omitted. Words outside
    all tables are emitted with separators derived from ``wordno``/``lineno``.

    Args:
        page: Page object providing ``_get_text_with_breaks()`` and
            ``_get_positions_and_text_and_breaks()``.
        tables: Formatted tables detected on ``page``; each must expose
            ``rect.is_intersecting(bbox)`` and ``df()``.

    Returns:
        The assembled page text with leading whitespace stripped. When
        ``tables`` is empty, the result of ``page._get_text_with_breaks()``
        is returned unchanged.
    """
    if not tables:
        return page._get_text_with_breaks()

    text_builder = []
    emitted = [False] * len(tables)  # one flag per table: already inserted?
    words = page._get_positions_and_text_and_breaks()
    for x0, y0, x1, y1, word, _blockno, lineno, wordno in words:
        bbox = (x0, y0, x1, y1)
        for j, table in enumerate(tables):
            if not table.rect.is_intersecting(bbox):
                continue
            if not emitted[j]:
                # Mark first so a table that fails to render is not retried
                # for every subsequent word inside it.
                emitted[j] = True
                try:
                    table_content = table.df().fillna("").to_latex(index=False)
                except Exception:
                    # Deliberate best-effort: rendering throws when a table
                    # has no text, in which case the table is simply dropped.
                    # `Exception` (not a bare except) keeps KeyboardInterrupt
                    # and SystemExit propagating.
                    pass
                else:
                    text_builder.append(f"\n{table_content}\n")
            break
        else:
            # No table claimed this word, so emit it as running text.
            # Presumably wordno == 0 marks the first word of a line and
            # lineno == 0 the first line of a block; confirm against gmft.
            if wordno == 0:
                text_builder.append("\n")
                if lineno == 0:
                    text_builder.append("\n")
            else:
                text_builder.append(" ")
            text_builder.append(word)

    return "".join(text_builder).lstrip()