clinical-study-extraction / backend /app /api /tabula_table_extraction.py
mmrech's picture
Upload folder using huggingface_hub
fd110c2 verified
"""
Tabula-based table extraction module
Java-based table extraction with good accuracy for various PDF formats
"""
import tabula
from typing import List, Dict, Any, Optional
import logging
import pandas as pd
logger = logging.getLogger(__name__)
class TabulaTableExtractor:
"""Extract tables using Tabula library"""
def __init__(self, pdf_path: str):
self.pdf_path = pdf_path
def extract_tables_from_page(self, page_num: int) -> List[Dict[str, Any]]:
"""
Extract tables from a specific page using Tabula
Args:
page_num: Page number (1-indexed)
Returns:
List of table dictionaries with markdown and metadata
"""
try:
# Extract tables as DataFrames
dfs = tabula.read_pdf(
self.pdf_path,
pages=page_num,
multiple_tables=True,
pandas_options={'header': None} # Let Tabula detect headers
)
if not dfs or len(dfs) == 0:
logger.info(f"No tables found on page {page_num} with Tabula")
return []
extracted_tables = []
for idx, df in enumerate(dfs):
# Skip empty dataframes
if df.empty:
continue
# Convert to markdown
markdown = self._dataframe_to_markdown(df)
extracted_tables.append({
"table_index": idx,
"page": page_num,
"markdown": markdown,
"rows": len(df),
"cols": len(df.columns),
"extraction_method": "tabula"
})
logger.info(f"Extracted {len(extracted_tables)} tables from page {page_num} using Tabula")
return extracted_tables
except Exception as e:
logger.error(f"Error extracting tables with Tabula from page {page_num}: {e}")
return []
def _dataframe_to_markdown(self, df: pd.DataFrame) -> str:
"""Convert pandas DataFrame to markdown table"""
try:
# Clean up the dataframe
df = df.fillna('') # Replace NaN with empty string
# Try to use first row as header if it looks like headers
if len(df) > 0:
first_row = df.iloc[0]
if all(isinstance(val, str) and val.strip() for val in first_row):
# First row looks like headers
df.columns = first_row
df = df[1:]
markdown_lines = []
# Header
headers = [str(h) for h in df.columns]
markdown_lines.append("| " + " | ".join(headers) + " |")
markdown_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
# Rows
for _, row in df.iterrows():
cells = [str(cell).strip() for cell in row]
markdown_lines.append("| " + " | ".join(cells) + " |")
return "\n".join(markdown_lines)
except Exception as e:
logger.error(f"Error converting DataFrame to markdown: {e}")
return str(df)
def extract_all_tables(self) -> List[Dict[str, Any]]:
"""Extract tables from all pages"""
all_tables = []
try:
# Extract from all pages
dfs = tabula.read_pdf(
self.pdf_path,
pages='all',
multiple_tables=True,
pandas_options={'header': None}
)
for idx, df in enumerate(dfs):
if df.empty:
continue
all_tables.append({
"table_index": idx,
"page": "unknown", # Tabula doesn't provide page info in batch mode
"markdown": self._dataframe_to_markdown(df),
"rows": len(df),
"cols": len(df.columns),
"extraction_method": "tabula"
})
logger.info(f"Extracted {len(all_tables)} tables from entire document using Tabula")
except Exception as e:
logger.error(f"Error extracting all tables with Tabula: {e}")
return all_tables
def extract_tables_tabula(pdf_path: str, page_num: Optional[int] = None) -> List[Dict[str, Any]]:
"""
Main function to extract tables using Tabula
Args:
pdf_path: Path to PDF file
page_num: Specific page number (1-indexed), or None for all pages
Returns:
List of extracted tables with markdown and metadata
"""
extractor = TabulaTableExtractor(pdf_path)
if page_num is not None:
return extractor.extract_tables_from_page(page_num)
else:
return extractor.extract_all_tables()