Spaces:
Sleeping
Sleeping
Update generate_indexes.py
Browse files- generate_indexes.py +44 -6
generate_indexes.py
CHANGED
|
@@ -45,14 +45,52 @@ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
|
|
| 45 |
chunks.append(" ".join(current_chunk))
|
| 46 |
return chunks
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
|
| 49 |
"""Extract tables from financial PDF into structured row-year-value dicts."""
|
| 50 |
-
tables = tabula.read_pdf(
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
)
|
|
|
|
| 56 |
|
| 57 |
table_rows = []
|
| 58 |
row_id = 0
|
|
|
|
| 45 |
chunks.append(" ".join(current_chunk))
|
| 46 |
return chunks
|
| 47 |
|
| 48 |
+
import pdfplumber
|
| 49 |
+
import pandas as pd
|
| 50 |
+
|
| 51 |
+
def read_pdf_tables(pdf_path, pages="all"):
    """Extract the tables of a PDF as string-typed DataFrames using pdfplumber.

    Intended as a stand-in for
    ``tabula.read_pdf(..., multiple_tables=True, pandas_options={'dtype': str})``.

    Args:
        pdf_path (str): Path to the PDF file.
        pages: Which pages to read. One of ``"all"``, a single 1-based page
            number, an iterable of 1-based page numbers, or a tabula-style
            selection string such as ``"1-3,5"``.

    Returns:
        List[pd.DataFrame]: One DataFrame per non-empty table, in the order
        the pages were requested. The first extracted row of each table is
        used as the column header and every value is cast to ``str``.

    Raises:
        IndexError: If a requested page number is outside ``1..page_count``.
        ValueError: If a selection string contains a non-numeric part.
    """
    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)

        # Normalise every accepted form of ``pages`` to 1-based page numbers.
        if pages == "all":
            page_numbers = range(1, page_count + 1)
        elif isinstance(pages, int):
            # A bare int is not iterable; treat it as a one-page selection.
            page_numbers = [pages]
        elif isinstance(pages, str):
            # Selection string: comma-separated page numbers and "a-b" ranges.
            page_numbers = []
            for part in pages.split(","):
                first, dash, last = part.partition("-")
                start = int(first)
                stop = int(last) if dash else start
                page_numbers.extend(range(start, stop + 1))
        else:
            page_numbers = list(pages)

        for number in page_numbers:
            # Reject 0 and negatives explicitly: after the 1-based -> 0-based
            # shift they would otherwise index from the end of the document.
            if not 1 <= number <= page_count:
                raise IndexError(
                    f"page {number} is out of range; PDF has {page_count} page(s)"
                )
            page = pdf.pages[number - 1]

            for table in page.extract_tables():
                if not table:  # ignore empty tables
                    continue
                header, *rows = table
                # Cast everything to str to mimic pandas_options={'dtype': str}.
                # NOTE(review): any None cell becomes the literal string "None"
                # here -- confirm downstream code expects that.
                tables.append(pd.DataFrame(rows, columns=header).astype(str))

    return tables
|
| 84 |
+
|
| 85 |
def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
|
| 86 |
"""Extract tables from financial PDF into structured row-year-value dicts."""
|
| 87 |
+
# tables = tabula.read_pdf(
|
| 88 |
+
# pdf_path,
|
| 89 |
+
# pages=pages,
|
| 90 |
+
# multiple_tables=True,
|
| 91 |
+
# pandas_options={'dtype': str}
|
| 92 |
+
# )
|
| 93 |
+
tables = read_pdf_tables(pdf_path)
|
| 94 |
|
| 95 |
table_rows = []
|
| 96 |
row_id = 0
|