redhairedshanks1 committed on
Commit
bb7d43f
·
verified ·
1 Parent(s): 6758b3d

Create extract_table.py

Browse files
Files changed (1) hide show
  1. services/extract_table.py +86 -0
services/extract_table.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF (not used in this script, but often for PDF handling)
3
+ import pdfplumber # For extracting tables from PDFs
4
+ import pandas as pd # For handling tabular data (CSV, Excel)
5
+ from docx.api import Document # For reading DOCX documents
6
+ import logging
7
+
8
# Configure root logging so this module's messages are visible when it is used
# standalone. basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _resolve_source(file):
    """Return ``(source, name_hint)`` for the caller-supplied *file*.

    ``source`` is what gets handed to the parsers: a filesystem path when one
    is available, otherwise the file-like object itself. ``name_hint`` is a
    string that can be used to guess the extension ("" when nothing is known).
    """
    # A plain path (str / bytes / PathLike) is used directly.
    if isinstance(file, (str, bytes, os.PathLike)):
        path = os.fsdecode(file)
        return path, path

    # File objects opened from disk (and many upload wrappers) expose the
    # path as ``.name``. It can also be an int file descriptor or a label that
    # is not a real path, so only treat it as a path if it exists on disk.
    name = getattr(file, "name", None)
    if isinstance(name, str):
        return (name if os.path.exists(name) else file), name

    # In-memory streams: let the parser read from the object itself.
    return file, ""


def extract_tables_from_file(file, start_page=None, end_page=None, filename=None):
    """
    Extract tables from a document, choosing the parser by file extension.

    Supports PDF, DOCX, CSV and XLS/XLSX. Extraction is best-effort: parser
    failures are logged and whatever was collected up to that point is
    returned; this function does not propagate parser exceptions.

    Args:
        file: Path (str / PathLike) or file-like object. When a file-like
            object carries a ``name`` that exists on disk, the document is
            read from that path; otherwise the object itself is read.
        start_page (int, optional): First page (1-based) for PDF parsing.
            Defaults to the first page; values below 1 are clamped to 1.
        end_page (int, optional): Last page (1-based, inclusive) for PDF
            parsing. Defaults to, and is clamped to, the last page.
        filename (str, optional): Name used to determine the file extension.
            When omitted, the name/path of ``file`` is used instead.

    Returns:
        str: All extracted tables joined by blank lines, or "" when nothing
        was extracted (unsupported type, no tables, or a parsing failure).
    """
    source, name_hint = _resolve_source(file)
    ext = os.path.splitext(filename or name_hint)[-1].lower()
    tables = []

    # ------------------ PDF (.pdf) Extraction ------------------ #
    if ext == ".pdf":
        try:
            with pdfplumber.open(source) as pdf:
                pages = pdf.pages
                total_pages = len(pages)
                first = max(start_page or 1, 1)
                last = min(end_page or total_pages, total_pages)

                # range() is empty when first > last, so an inverted or
                # out-of-bounds page window simply yields no tables.
                for page_num in range(first, last + 1):
                    try:
                        for table in pages[page_num - 1].extract_tables():
                            # Pipe-separate the cells; empty cells come back as None.
                            rows = [
                                " | ".join(cell or "" for cell in row)
                                for row in table
                                if row
                            ]
                            tables.append(f"Page {page_num} Table:\n" + "\n".join(rows))
                    except Exception as e:
                        # One bad page must not discard the rest of the document.
                        logger.warning(
                            "PDF table extraction failed on page %s: %s", page_num, e
                        )
        except Exception as e:
            logger.error("Failed to read PDF file: %s", e)

    # ------------------ DOCX (.docx) Extraction ------------------ #
    elif ext == ".docx":
        try:
            doc = Document(source)
            for t in doc.tables:
                # One output line per table row, cells pipe-separated.
                rows = [
                    " | ".join(cell.text.strip() for cell in row.cells)
                    for row in t.rows
                ]
                tables.append("\n".join(rows))
        except Exception as e:
            logger.error("DOCX table extraction failed: %s", e)

    # ------------------ CSV (.csv) Extraction ------------------ #
    elif ext == ".csv":
        try:
            df = pd.read_csv(source)
            tables.append(df.to_string(index=False))  # Render without the index column
        except Exception as e:
            logger.warning("CSV parsing error: %s", e)

    # ------------------ Excel (.xls, .xlsx) Extraction ------------------ #
    elif ext in (".xls", ".xlsx"):
        try:
            # Context manager guarantees the workbook handle is released.
            with pd.ExcelFile(source) as xl:
                for s in xl.sheet_names:
                    sheet_df = xl.parse(s)
                    tables.append(f"Sheet: {s}\n{sheet_df.to_string(index=False)}")
        except Exception as e:
            logger.warning("Excel parsing error: %s", e)

    # ------------------ Unsupported File Type ------------------ #
    else:
        logger.warning("Unsupported file type: %s", ext)

    # Join all extracted tables into a single string separated by blank lines
    return "\n\n".join(tables)