Create extract_table.py
Browse files- services/extract_table.py +86 -0
services/extract_table.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import fitz # PyMuPDF (not used in this script, but often for PDF handling)
|
| 3 |
+
import pdfplumber # For extracting tables from PDFs
|
| 4 |
+
import pandas as pd # For handling tabular data (CSV, Excel)
|
| 5 |
+
from docx.api import Document # For reading DOCX documents
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
# Configure the root logger at import time so that INFO-level (and higher)
# messages from this module are visible when it is used standalone.
# NOTE(review): logging.basicConfig() is documented to do nothing when the
# root logger already has handlers, so a host application that configures
# logging before importing this module should keep its own setup — confirm
# that import order holds for the real entry point.
|
| 9 |
+
logging.basicConfig(level=logging.INFO)
|
| 10 |
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
def extract_tables_from_file(file, start_page=None, end_page=None, filename=None):
    """
    Extract every table from a document and return them as a single string.

    The parser is chosen from the file extension. Supports PDF, DOCX, CSV and
    XLS/XLSX. Extraction is best-effort: failures are logged, never raised,
    and whatever was collected before a failure is still returned.

    Args:
        file: The document to read. May be an object exposing a ``name``
            attribute that holds a filesystem path (e.g. an open file or an
            upload/temp-file wrapper), a path (``str`` or ``os.PathLike``),
            or a nameless binary file-like object such as ``io.BytesIO``.
        start_page (int, optional): First page to scan, 1-based (PDF only).
            ``None`` or ``0`` means the first page.
        end_page (int, optional): Last page to scan, inclusive (PDF only).
            ``None`` or ``0`` means the last page.
        filename (str, optional): Name used to determine the file extension.
            When omitted, the extension is taken from the path of ``file``
            if it has one.

    Returns:
        str: All extracted tables separated by blank lines; an empty string
        when nothing was extracted or the file type is unsupported.
    """
    # Resolve what to hand to the parsers. Real paths are used as-is (a
    # pathlib.Path also has a ``.name`` attribute, but that is only the
    # basename, so paths must be checked first). Other objects are opened
    # through their ``name`` path when they have one; nameless file-like
    # objects are passed through unchanged, which every parser below accepts.
    if isinstance(file, (str, os.PathLike)):
        source = file
    else:
        source = getattr(file, "name", file)

    # Prefer the caller-supplied filename for type detection; otherwise fall
    # back to the extension of the source path.
    ext_hint = filename
    if not ext_hint and isinstance(source, (str, os.PathLike)):
        ext_hint = os.fsdecode(source)
    ext = os.path.splitext(ext_hint or "")[-1].lower()

    tables = []

    # ------------------ PDF (.pdf) Extraction ------------------ #
    if ext == ".pdf":
        try:
            with pdfplumber.open(source) as pdf:
                total_pages = len(pdf.pages)
                start = max(start_page or 1, 1)
                # Clamp to >= 0 so a negative end_page selects no pages
                # instead of being read as a from-the-end slice index.
                end = max(min(end_page or total_pages, total_pages), 0)

                selected = pdf.pages[start - 1:end]
                for page_num, page in enumerate(selected, start=start):
                    # A failure on one page must not abort the other pages.
                    try:
                        for table in page.extract_tables():
                            # Pipe-separate the cells; empty cells come back
                            # as None and are rendered as empty strings.
                            rows = [
                                " | ".join(cell or "" for cell in row)
                                for row in table
                                if row
                            ]
                            tables.append(
                                f"Page {page_num} Table:\n" + "\n".join(rows)
                            )
                    except Exception as e:
                        logger.warning(
                            "PDF table extraction failed on page %s: %s",
                            page_num,
                            e,
                        )
        except Exception as e:
            logger.error("Failed to read PDF file: %s", e)

    # ------------------ DOCX (.docx) Extraction ------------------ #
    elif ext == ".docx":
        try:
            doc = Document(source)
            for docx_table in doc.tables:
                rows = [
                    " | ".join(cell.text.strip() for cell in row.cells)
                    for row in docx_table.rows
                ]
                tables.append("\n".join(rows))
        except Exception as e:
            logger.error("DOCX table extraction failed: %s", e)

    # ------------------ CSV (.csv) Extraction ------------------ #
    elif ext == ".csv":
        try:
            df = pd.read_csv(source)
            # Render without the positional index column.
            tables.append(df.to_string(index=False))
        except Exception as e:
            logger.warning("CSV parsing error: %s", e)

    # ------------------ Excel (.xls, .xlsx) Extraction ------------------ #
    elif ext in (".xls", ".xlsx"):
        try:
            # Context manager guarantees the workbook handle is closed even
            # when parsing one of the sheets fails.
            with pd.ExcelFile(source) as workbook:
                for sheet in workbook.sheet_names:
                    sheet_df = workbook.parse(sheet)
                    tables.append(
                        f"Sheet: {sheet}\n{sheet_df.to_string(index=False)}"
                    )
        except Exception as e:
            logger.warning("Excel parsing error: %s", e)

    # ------------------ Unsupported File Type ------------------ #
    else:
        logger.warning("Unsupported file type: %s", ext)

    # Join all extracted tables into a single string separated by blank lines.
    return "\n\n".join(tables)
|