rishabhsetiya committed on
Commit
9db2d7f
·
verified ·
1 Parent(s): 13ea108

Update generate_indexes.py

Browse files
Files changed (1) hide show
  1. generate_indexes.py +44 -6
generate_indexes.py CHANGED
@@ -45,14 +45,52 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
45
  chunks.append(" ".join(current_chunk))
46
  return chunks
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
49
  """Extract tables from financial PDF into structured row-year-value dicts."""
50
- tables = tabula.read_pdf(
51
- pdf_path,
52
- pages=pages,
53
- multiple_tables=True,
54
- pandas_options={'dtype': str}
55
- )
 
56
 
57
  table_rows = []
58
  row_id = 0
 
45
  chunks.append(" ".join(current_chunk))
46
  return chunks
47
 
48
+ import pdfplumber
49
+ import pandas as pd
50
+
51
def _resolve_page_indices(pages, page_count):
    """Translate a 1-based page selection into a list of 0-based indices.

    Args:
        pages: ``"all"``, a single page number, a tabula-style string such
            as ``"1-3,5"``, or an iterable of page numbers (all 1-based).
        page_count: Number of pages available in the document.

    Returns:
        list[int]: 0-based page indices, in the order requested.

    Raises:
        ValueError: If a string selection contains a non-numeric part.
        IndexError: If any requested page lies outside ``1..page_count``.
    """
    if isinstance(pages, str):
        spec = pages.strip()
        if spec.lower() == "all":
            return list(range(page_count))
        numbers = []
        for part in spec.split(","):
            part = part.strip()
            if not part:
                continue
            first, dash, last = part.partition("-")
            if dash:
                # Inclusive range, e.g. "2-4" -> 2, 3, 4.
                numbers.extend(range(int(first), int(last) + 1))
            else:
                numbers.append(int(first))
    elif isinstance(pages, int):
        numbers = [pages]
    else:
        numbers = [int(p) for p in pages]

    for number in numbers:
        # Reject 0 and negatives explicitly: after the 1-based -> 0-based
        # shift they would silently index from the end of the document.
        if not 1 <= number <= page_count:
            raise IndexError(
                f"page {number} is out of range for a document "
                f"with {page_count} page(s)"
            )
    return [number - 1 for number in numbers]


def read_pdf_tables(pdf_path: str, pages="all") -> List[pd.DataFrame]:
    """Extract every table from a PDF with pdfplumber.

    Intended as a stand-in for
    ``tabula.read_pdf(..., multiple_tables=True, pandas_options={'dtype': str})``.

    Args:
        pdf_path: Path to the PDF file.
        pages: Pages to read, 1-based. Accepts ``"all"`` (default), a single
            page number, a list of page numbers, or a tabula-style string
            such as ``"1-3,5"``.

    Returns:
        One DataFrame per non-empty table found, in page order. The first
        row of each table is used as the column header and every cell is
        converted with ``astype(str)``.

    Raises:
        ValueError: If ``pages`` is a string that cannot be parsed.
        IndexError: If a requested page does not exist in the document.
    """
    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for index in _resolve_page_indices(pages, len(pdf.pages)):
            for table in pdf.pages[index].extract_tables():
                if not table:  # ignore empty tables
                    continue
                header, *rows = table
                # NOTE(review): astype(str) stringifies every cell, so a cell
                # that arrives as None is not kept as a missing value --
                # confirm this is what downstream parsing expects.
                tables.append(pd.DataFrame(rows, columns=header).astype(str))

    return tables
84
+
85
  def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
86
  """Extract tables from financial PDF into structured row-year-value dicts."""
87
+ # tables = tabula.read_pdf(
88
+ # pdf_path,
89
+ # pages=pages,
90
+ # multiple_tables=True,
91
+ # pandas_options={'dtype': str}
92
+ # )
93
+ tables = read_pdf_tables(pdf_path)
94
 
95
  table_rows = []
96
  row_id = 0