Spaces:

rishabhsetiya
/

CAIAssignmentGradio

Sleeping

App Files Files Community

rishabhsetiya committed on Aug 24, 2025

Commit

9db2d7f

verified ·

1 Parent(s): 13ea108

Update generate_indexes.py

Browse files

Files changed (1) hide show

generate_indexes.py +44 -6

generate_indexes.py CHANGED Viewed

@@ -45,14 +45,52 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
         chunks.append(" ".join(current_chunk))
     return chunks
 def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
     """Extract tables from financial PDF into structured row-year-value dicts."""
-    tables = tabula.read_pdf(
-        pdf_path,
-        pages=pages,
-        multiple_tables=True,
-        pandas_options={'dtype': str}
-    )
     table_rows = []
     row_id = 0

         chunks.append(" ".join(current_chunk))
     return chunks
+import pdfplumber
+import pandas as pd
def _resolve_page_indices(pages, page_count: int) -> List[int]:
    """Translate a 1-based page selection into 0-based page indices.

    Args:
        pages: ``"all"``, a single page number, an iterable of page numbers,
            or a comma-separated string of numbers and inclusive ranges
            such as ``"1-3,5"``.
        page_count: Number of pages in the open document.

    Returns:
        The selected pages as 0-based indices, in the order requested.

    Raises:
        IndexError: If a requested page is outside ``1..page_count``.
        ValueError: If a string selection contains a non-numeric token.
    """
    if isinstance(pages, str):
        spec = pages.strip()
        if spec.lower() == "all":
            return list(range(page_count))
        numbers = []
        for token in spec.split(","):
            token = token.strip()
            if not token:
                continue
            first, dash, last = token.partition("-")
            if dash:
                # "a-b" is an inclusive range of page numbers.
                numbers.extend(range(int(first), int(last) + 1))
            else:
                numbers.append(int(first))
    elif isinstance(pages, int):
        numbers = [pages]
    else:
        numbers = [int(p) for p in pages]

    for number in numbers:
        # Reject 0 and negatives explicitly: after the 1-based -> 0-based shift
        # they would otherwise wrap around and silently select a page counted
        # from the end of the document.
        if not 1 <= number <= page_count:
            raise IndexError(
                f"page {number} is out of range; document has {page_count} page(s)"
            )
    return [number - 1 for number in numbers]


def read_pdf_tables(pdf_path: str, pages="all") -> List[pd.DataFrame]:
    """Extract every table found on the selected pages of a PDF.

    Intended as a pdfplumber-based stand-in for
    ``tabula.read_pdf(..., multiple_tables=True, pandas_options={'dtype': str})``.

    Args:
        pdf_path: Path to the PDF file.
        pages: Pages to read, 1-based. Either ``"all"`` (default), a single
            page number, an iterable of page numbers, or a string of
            comma-separated numbers and inclusive ranges such as ``"1-3,5"``.

    Returns:
        One DataFrame per non-empty table, in page order. The first row of
        each table is used as the column header and every column is cast
        to ``str``.

    Raises:
        IndexError: If a requested page number does not exist in the document.
        ValueError: If ``pages`` is a string that cannot be parsed.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for index in _resolve_page_indices(pages, len(pdf.pages)):
            for table in pdf.pages[index].extract_tables():
                if not table:  # ignore empty tables
                    continue
                header, *rows = table  # first row as header
                # Cast all columns to str to mimic pandas_options={'dtype': str}.
                # NOTE(review): if the extractor yields None for blank cells,
                # astype(str) turns them into the literal "None" -- confirm
                # that downstream parsing expects this.
                tables.append(pd.DataFrame(rows, columns=header).astype(str))
    return tables
 def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
     """Extract tables from financial PDF into structured row-year-value dicts."""
+    # tables = tabula.read_pdf(
+    #     pdf_path,
+    #     pages=pages,
+    #     multiple_tables=True,
+    #     pandas_options={'dtype': str}
+    # )
+    tables = read_pdf_tables(pdf_path)
     table_rows = []
     row_id = 0