purajith committed on
Commit
3419102
·
verified ·
1 Parent(s): 0b6a55b

Delete data_extraction.py

Browse files
Files changed (1) hide show
  1. data_extraction.py +0 -171
data_extraction.py DELETED
@@ -1,171 +0,0 @@
1
- import pandas as pd
2
- from docx import Document as DocxDocument # Avoids conflict with langchain's Document
3
- import csv
4
- import fitz # PyMuPDF for text extraction
5
- import camelot # Table extraction
6
- from langchain.schema import Document # Structured document format
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- import os
9
- from dotenv import load_dotenv
10
- load_dotenv()
11
- import warnings
12
- warnings.filterwarnings("ignore")
13
- # Ensure the API key is properly set
14
- openai_key = os.getenv("openai_key")
15
- os.environ["OPENAI_API_KEY"] = openai_key # Ensure 'openai_key' is defined
16
- # Function to read and process .docx files
17
- def extract_text_and_tables(docx_path):
18
- doc = DocxDocument(docx_path) # Use renamed import to avoid conflict
19
-
20
- # Extract text
21
- text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
22
-
23
- # Extract tables
24
- tables = []
25
- for table in doc.tables:
26
- table_data = []
27
- for row in table.rows:
28
- row_data = [cell.text.strip() for cell in row.cells]
29
- table_data.append(row_data)
30
- tables.append(Document(page_content=str(table_data), metadata={"source": docx_path})) # Store as Document object
31
-
32
- return text, tables
33
-
34
- # Function to read and process .xlsx (Excel) files
35
- def read_excel(file_path):
36
- print(f"Reading Excel file: {file_path}")
37
- excel_data = pd.read_excel(file_path, sheet_name=None)
38
-
39
- text = []
40
- for sheet_name, df in excel_data.items():
41
- text.append(f"Sheet: {sheet_name}")
42
- for row in df.values:
43
- row_text = " | ".join(str(cell) for cell in row)
44
- text.append(row_text)
45
-
46
- return text
47
-
48
- # Function to read and process .csv files
49
- def read_csv(file_path):
50
- print(f"Reading CSV file: {file_path}")
51
-
52
- text = []
53
- with open(file_path, mode='r') as file:
54
- reader = csv.reader(file)
55
- for row in reader:
56
- row_text = " | ".join(row)
57
- text.append(row_text)
58
-
59
- return text
60
-
61
- # Function to extract text from PDFs
62
- def extract_text(pdf_path):
63
- """Extracts text from a PDF file and returns it as a list of Document objects."""
64
- documents = []
65
- try:
66
- doc = fitz.open(pdf_path)
67
- for page_num, page in enumerate(doc, start=1):
68
- text = page.get_text("text").strip()
69
- if text:
70
- documents.append(Document(
71
- page_content=text,
72
- metadata={"source": pdf_path, "page": page_num}
73
- ))
74
- except Exception as e:
75
- print(f"❌ Error extracting text: {e}")
76
- return documents
77
-
78
- # Function to extract tables from PDFs
79
- def extract_tables(pdf_path):
80
- """Extracts tables from a PDF using Camelot and returns them as Document objects."""
81
- table_documents = []
82
- try:
83
- tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
84
-
85
- if tables.n == 0:
86
- print(f"⚠️ No tables found in {pdf_path}. Adding dummy data for testing.")
87
- return [Document(page_content="Dummy Table: No real data found", metadata={"source": pdf_path, "table_index": 0})]
88
-
89
- for i in range(tables.n):
90
- table_text = tables[i].df.to_string()
91
- table_documents.append(Document(
92
- page_content=table_text,
93
- metadata={"source": pdf_path, "table_index": i+1}
94
- ))
95
-
96
- except Exception as e:
97
- print(f"❌ Error extracting tables from {pdf_path}: {e}")
98
- return [Document(page_content="Dummy Table: Extraction error", metadata={"source": pdf_path, "table_index": -1})]
99
-
100
- return table_documents
101
-
102
- # Function to chunk tables (for docx and pdf)
103
- def chunk_table(documents, chunk_size=2):
104
- """Chunks table data row-wise from Document objects."""
105
- chunks = []
106
- for doc in documents:
107
- if isinstance(doc, Document): # Ensure it's a Document object
108
- table_text = doc.page_content # Extract the actual text
109
-
110
- rows = table_text.split("\n") # Split into rows
111
- for i in range(0, len(rows), chunk_size):
112
- chunk = "\n".join(rows[i:i+chunk_size]) # Group rows
113
- chunks.append(Document(page_content=chunk, metadata=doc.metadata)) # Preserve metadata
114
-
115
- return chunks
116
-
117
- # Function to process .docx, .xlsx, .csv, and PDF files
118
- def process_files(file, text_chunk_size=1000, chunk_overlap=40, table_chunk_size=2):
119
- text = []
120
- tables = []
121
-
122
- # Process .docx file
123
- if file.endswith(".docx"):
124
- docx_text, docx_tables = extract_text_and_tables(file)
125
- text.append(docx_text)
126
- tables.extend(docx_tables)
127
-
128
- # Process .xlsx file
129
- if file.endswith((".xlsx", ".xls")):
130
- excel_text = read_excel(file)
131
- text.extend(excel_text)
132
-
133
- # Process .csv file
134
- if file.endswith(".csv"):
135
- csv_text = read_csv(file)
136
- text.extend(csv_text)
137
-
138
- # Process PDF file
139
- if file.endswith(".pdf"):
140
- pdf_text_documents = extract_text(file)
141
- pdf_table_documents = extract_tables(file)
142
- text.extend([doc.page_content for doc in pdf_text_documents])
143
-
144
- if pdf_table_documents: # Only add tables if they exist
145
- tables.extend(pdf_table_documents)
146
- else:
147
- print(f"⚠️ No tables found in {file}, skipping table embeddings.")
148
-
149
- # Chunk the tables **only if tables exist**
150
- table_chunks = chunk_table(tables, chunk_size=table_chunk_size) if tables else []
151
-
152
- # Chunk the text
153
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=text_chunk_size, chunk_overlap=chunk_overlap)
154
- text_chunks = text_splitter.split_documents([Document(page_content=t) for t in text]) if text else []
155
-
156
- combined_chunks = text_chunks + table_chunks
157
-
158
- return combined_chunks if combined_chunks else [] # Ensure no empty embeddings
159
-
160
- # Function to process multiple files
161
- # def data_processing(file_paths):
162
- # all_combined_chunks = {}
163
- # for file in file_paths:
164
- # print(f"Processing file: {file.split('/')[-1]}")
165
- # combined_chunks = process_files(file)
166
- # all_combined_chunks[file] = combined_chunks
167
- # return all_combined_chunks
168
-
169
- # # Example usage
170
- # file_paths = ["/content/Acceptable Use Policy.docx","/content/RiskAnalysisGuide.pdf"]
171
- # all_combined_chunks = data_processing(file_paths)