purajith committed on
Commit
5a004e3
·
verified ·
1 Parent(s): 90eee5a

Delete data_processing1

Browse files
Files changed (1) hide show
  1. data_processing1 +0 -257
data_processing1 DELETED
@@ -1,257 +0,0 @@
1
- import pandas as pd
2
- import os
3
- from pptx import Presentation
4
- from docx import Document
5
- import pymupdf4llm
6
- import fitz
7
- import pymupdf
8
- from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter, TextSplitter
9
- import logging
10
- from bs4 import BeautifulSoup
11
- from langchain_text_splitters import HTMLHeaderTextSplitter
12
- from langchain_text_splitters import RecursiveCharacterTextSplitter
13
- import re
14
- import nltk
15
- import pytesseract
16
- from PIL import Image
17
- from docx import Document as DocxDocument
18
- from langchain_community.document_loaders import (
19
- UnstructuredExcelLoader,
20
- TextLoader,
21
- CSVLoader,
22
- PyMuPDFLoader
23
- )
24
- chunk_size = 4000
25
- chunk_overlap =150
26
- nltk.download('punkt_tab')
27
- nltk.download('averaged_perceptron_tagger_eng')
28
-
29
- # Configure Tesseract path
30
- pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
31
-
32
def chunk(data):
    """Split already-loaded Document objects into ~4000-char pieces (150 overlap)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=150)
    return splitter.split_documents(data)
36
-
37
def chunks(data):
    """Build Documents from raw strings, then split into ~4000-char pieces (150 overlap)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=150)
    return splitter.create_documents(data)
41
-
42
def chunk_text(text, chunk_size=4000):
    """Slice *text* into consecutive, non-overlapping pieces of at most *chunk_size* chars."""
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
44
-
45
-
46
# Function to extract text from an image
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at *image_path* and return the extracted text.

    Uses a context manager so the underlying file handle is released promptly —
    Image.open is lazy and otherwise keeps the file open until GC.
    """
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
51
-
52
-
53
def get_file_byte_string(blob_service_client, container_name, blob_name):
    """Download a blob from Azure storage and return its raw bytes.

    Returns None when the blob does not exist.
    """
    logging.info("Initiating Container Client")
    container_client = blob_service_client.get_container_client(container_name)

    logging.info("Initiating Blob Client")
    blob_client = container_client.get_blob_client(blob_name)

    blob_exists = blob_client.exists()
    logging.info(f"Blob file exists: {blob_exists}")

    # Guard clause: nothing to download.
    if not blob_exists:
        return None

    logging.info("Downloading Blob")
    file_content = blob_client.download_blob().readall()
    logging.info("Blob Downloaded")
    return file_content
71
def get_chunk_data(path):
    """Load the file at *path* and return a list of text chunks suited to its type.

    Dispatches on file extension: .pdf, .pptx, .doc/.docx, .xlsx, .csv, .txt,
    images (.png/.jpg/.jpeg via OCR) and .html. Returns None (falls off the
    end) for unsupported extensions.
    """
    # PDF: convert to markdown, split on headers, then size-bound the chunks.
    if path.endswith(".pdf"):
        pdf_doc = fitz.open(path)
        markdown_data = pymupdf4llm.to_markdown(pdf_doc)

        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on, strip_headers=False
        )
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000, chunk_overlap=150
        )
        markdown_splits = markdown_splitter.split_text(markdown_data)
        text_splits = text_splitter.split_documents(markdown_splits)
        return [ts.page_content for ts in text_splits]

    # PowerPoint: one chunk per slide (all text frames joined).
    elif path.endswith(".pptx"):
        presentation = Presentation(path)
        slide_data = []
        for slide in presentation.slides:
            slide_text = "\n".join(
                shape.text for shape in slide.shapes if shape.has_text_frame
            )
            if slide_text:
                slide_data.append(slide_text)
        return slide_data

    # Word: join paragraphs, then split into ~4000-char chunks.
    # NOTE(review): python-docx cannot open legacy .doc files even though the
    # extension is accepted here — confirm callers only pass .docx.
    elif path.endswith((".doc", ".docx")):
        print("path", path)
        doc = Document(path)
        doc_text = "\n".join(para.text for para in doc.paragraphs)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=150)
        return text_splitter.split_text(doc_text)

    # Excel: load via Unstructured, reuse the shared Document splitter.
    elif path.endswith(".xlsx"):
        loader = UnstructuredExcelLoader(path)
        data = chunk(loader.load())
        return [x.page_content for x in data]

    # CSV: one Document per row from CSVLoader, then split.
    elif path.endswith(".csv"):
        loader = CSVLoader(path)
        datas = chunk(loader.load())
        return [x.page_content for x in datas]

    # Plain text.
    elif path.endswith(".txt"):
        loader = TextLoader(path)
        data = chunk(loader.load())
        return [x.page_content for x in data]

    # Images: OCR then fixed-size character chunks.
    elif path.endswith((".png", ".jpg", ".jpeg")):
        ocr_text = extract_text_from_image(path)
        text = chunk_text(ocr_text, chunk_size=4000)
        print("path", path)
        # BUG FIX: original was print('text",text') — a garbled literal that
        # never printed the variable.
        print("text", text)
        return text

    # HTML: flatten tables to JSON and links to "text (url)", then split on headers.
    elif path.endswith(".html"):
        import json  # BUG FIX: json was used below but never imported at module level

        def convert_table_contents_to_json(table):
            """Serialize one <table> into a JSON string of {header: cell} rows."""
            headers = [header.text for header in table.find_all('th')]
            table_data = []
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if cells:
                    row_data = {headers[i]: cell.text for i, cell in enumerate(cells)}
                    table_data.append(row_data)
            return json.dumps(table_data, indent=1)

        def extract_and_replace_tables(bs_object):
            """Replace every <table> in bs_object with its JSON representation.

            Returns the (mutated) BeautifulSoup object.
            """
            for table in bs_object.find_all('table'):
                json_rows = convert_table_contents_to_json(table)
                table.replace_with(BeautifulSoup(json_rows, 'html.parser'))
            return bs_object

        def extract_and_replace_a_tags(bs_object):
            """Replace every <a> tag with its text content plus '(url)'."""
            for a_tag in bs_object.find_all('a'):
                if a_tag.text and a_tag.get('href'):
                    url = a_tag.get('href')
                    a_tag.replace_with(a_tag.text + ' (' + url + ')')
            return bs_object

        headers_to_split_on = [
            ("h1", "H 1"),
            ("h2", "H 2"),
            ("h3", "H 3"),
        ]
        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)

        # BUG FIX: the original passed the *path string* itself to BeautifulSoup,
        # parsing the filename as HTML. Read the file contents like every other
        # branch does.
        with open(path, "r", encoding="utf-8") as fh:
            html_source = fh.read()
        soup = BeautifulSoup(html_source, 'html.parser')
        soup = extract_and_replace_a_tags(soup)
        soup = extract_and_replace_tables(soup)
        html_string = str(soup)

        html_header_splits = html_splitter.split_text(html_string)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=300)
        html_chunks = []
        for html_split in html_header_splits:
            for split in text_splitter.split_text(html_split.page_content):
                # Attach the header metadata to each split for retrieval context.
                html_chunks.append(str(html_split.metadata) + " " + split)
        return html_chunks
        # NOTE(review): a second, unreachable `elif path.endswith(".xlsx")`
        # branch duplicating the earlier Excel handling was removed here.
248
-
249
def get_direct_chunks(data):
    """Split a raw string into ~4000-char chunks with 150-char overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=150)
    return splitter.split_text(data)
253
-
254
def get_clean_id(text):
    """Collapse every run of non-alphanumeric characters in *text* into a single '_'."""
    non_alnum = re.compile(r"[^a-zA-Z0-9]+")
    return non_alnum.sub('_', text)