Prathmesh48 commited on
Commit
0159657
·
verified ·
1 Parent(s): bcb7c1a

Update embedding.py

Browse files
Files changed (1) hide show
  1. embedding.py +251 -251
embedding.py CHANGED
@@ -1,251 +1,251 @@
1
- import requests
2
- import json
3
- import os
4
- import concurrent.futures
5
- import random
6
- from langchain_google_genai import ChatGoogleGenerativeAI
7
- from langchain_community.document_loaders import WebBaseLoader
8
- from langchain_community.document_loaders import PyPDFLoader
9
- from langchain.text_splitter import RecursiveCharacterTextSplitter
10
- import google.generativeai as genai
11
-
12
-
13
- gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
14
- gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
15
- gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
16
- gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
17
-
18
- genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
19
-
20
-
21
- def pdf_extractor(link):
22
- text = ''
23
-
24
- try:
25
- loader = PyPDFLoader(link)
26
- pages = loader.load_and_split()
27
-
28
- for page in pages:
29
- text+=page.page_content
30
- except:
31
- pass
32
-
33
- return [text]
34
-
35
- def web_extractor(link):
36
- text = ''
37
-
38
- try:
39
- loader = WebBaseLoader(link)
40
- pages = loader.load_and_split()
41
-
42
- for page in pages:
43
- text+=page.page_content
44
- except:
45
- pass
46
-
47
- return [text]
48
-
49
-
50
- def feature_extraction(tag, history , context):
51
-
52
- prompt = f'''
53
- You are an intelligent assistant tasked with updating product information. You have two data sources:
54
- 1. Tag_History: Previously gathered information about the product.
55
- 2. Tag_Context: New data that might contain additional details.
56
-
57
- Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
58
-
59
- Guidelines:
60
- - Only add new details that are relevant to the {tag} FIELD.
61
- - Do not add or modify any other fields in the Tag_History.
62
- - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
63
-
64
- Here is the data:
65
-
66
- Tag_Context: {str(context)}
67
- Tag_History: {history}
68
-
69
- Respond with the updated Tag_History.
70
- '''
71
-
72
- model = random.choice([gemini,gemini1])
73
- result = model.invoke(prompt)
74
-
75
- return result.content
76
-
77
- def detailed_feature_extraction(find, context):
78
-
79
- prompt = f'''
80
- You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
81
- 1. Context: The gathered information about the product.
82
- 2. Format: Details which need to be filled based on Context.
83
-
84
- Your job is to read the Context and update the relevant field in Format using Context.
85
-
86
- Guidelines:
87
- - Only add details that are relevant to the individual FIELD.
88
- - Do not add or modify any other fields in the Format.
89
- - If nothing found return None.
90
-
91
- Here is the data:
92
-
93
- The Context is {str(context)}
94
- The Format is {str(find)}
95
- '''
96
-
97
- model = random.choice([gemini,gemini1,gemini2,gemini3])
98
- result = model.invoke(prompt)
99
-
100
- return result.content
101
-
102
- def detailed_history(history):
103
-
104
- details = {
105
- "Introduction": {
106
- "Product Name": None,
107
- "Overview of the product": None,
108
- "Purpose of the manual": None,
109
- "Audience": None,
110
- "Additional Details": None
111
- },
112
- "Specifications": {
113
- "Technical specifications": None,
114
- "Performance metrics": None,
115
- "Additional Details": None
116
- },
117
- "Product Overview": {
118
- "Product features": None,
119
- "Key components and parts": None,
120
- "Additional Details": None
121
- },
122
- "Safety Information": {
123
- "Safety warnings and precautions": None,
124
- "Compliance and certification information": None,
125
- "Additional Details": None
126
- },
127
- "Installation Instructions": {
128
- "Unboxing and inventory checklist": None,
129
- "Step-by-step installation guide": None,
130
- "Required tools and materials": None,
131
- "Additional Details": None
132
- },
133
- "Setup and Configuration": {
134
- "Initial setup procedures": None,
135
- "Configuration settings": None,
136
- "Troubleshooting setup issues": None,
137
- "Additional Details": None
138
- },
139
- "Operation Instructions": {
140
- "How to use the product": None,
141
- "Detailed instructions for different functionalities": None,
142
- "User interface guide": None,
143
- "Additional Details": None
144
- },
145
- "Maintenance and Care": {
146
- "Cleaning instructions": None,
147
- "Maintenance schedule": None,
148
- "Replacement parts and accessories": None,
149
- "Additional Details": None
150
- },
151
- "Troubleshooting": {
152
- "Common issues and solutions": None,
153
- "Error messages and their meanings": None,
154
- "Support Information": None,
155
- "Additional Details": None
156
- },
157
- "Warranty Information": {
158
- "Terms and Conditions": None,
159
- "Service and repair information": None,
160
- "Additional Details": None
161
- },
162
- "Legal Information": {
163
- "Copyright information": None,
164
- "Trademarks and patents": None,
165
- "Disclaimers": None,
166
- "Additional Details": None
167
-
168
- }
169
- }
170
-
171
- for key,val in history.items():
172
-
173
- find = details[key]
174
-
175
- details[key] = str(detailed_feature_extraction(find,val))
176
-
177
- return details
178
-
179
-
180
- def get_embeddings(link):
181
-
182
- print(f"\nCreating Embeddings ----- {link}")
183
- history = {
184
- "Introduction": "",
185
- "Specifications": "",
186
- "Product Overview": "",
187
- "Safety Information": "",
188
- "Installation Instructions": "",
189
- "Setup and Configuration": "",
190
- "Operation Instructions": "",
191
- "Maintenance and Care": "",
192
- "Troubleshooting": "",
193
- "Warranty Information": "",
194
- "Legal Information": ""
195
- }
196
-
197
- # Extract Text -----------------------------
198
- print("Extracting Text")
199
- if link[-3:] == '.md' or link[8:11] == 'en.':
200
- text = web_extractor(link)
201
- else:
202
- text = pdf_extractor(link)
203
-
204
- # Create Chunks ----------------------------
205
- print("Writing Tag Data")
206
- chunks = text_splitter.create_documents(text)
207
-
208
- for chunk in chunks:
209
-
210
- with concurrent.futures.ThreadPoolExecutor() as executor:
211
- future_to_key = {
212
- executor.submit(
213
- feature_extraction, f"Product {key}", history[key], chunk.page_content
214
- ): key for key in history
215
- }
216
- for future in concurrent.futures.as_completed(future_to_key):
217
- key = future_to_key[future]
218
- try:
219
- response = future.result()
220
- history[key] = response
221
- except Exception as e:
222
- print(f"Error processing {key}: {e}")
223
-
224
- # history = detailed_history(history)
225
- print("Creating Vectors")
226
- genai_embeddings=[]
227
-
228
- for tag in history:
229
- result = genai.embed_content(
230
- model="models/embedding-001",
231
- content=history[tag],
232
- task_type="retrieval_document")
233
- genai_embeddings.append(result['embedding'])
234
-
235
-
236
- return history,genai_embeddings
237
-
238
- global text_splitter
239
- global data
240
- global history
241
-
242
-
243
- text_splitter = RecursiveCharacterTextSplitter(
244
- chunk_size = 10000,
245
- chunk_overlap = 100,
246
- separators = ["",''," "]
247
- )
248
-
249
-
250
- if __name__ == '__main__':
251
- pass
 
1
+ import requests
2
+ import json
3
+ import os
4
+ import concurrent.futures
5
+ import random
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_community.document_loaders import WebBaseLoader
8
+ from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ import google.generativeai as genai
11
+
12
+
13
# One chat client per API key so concurrent requests can be spread across
# quota pools (see random.choice(...) in the extraction helpers below).
#
# SECURITY: API keys were previously hard-coded in this file (and therefore
# committed to version control — they must be considered compromised and
# revoked). Keys are now read from the environment: set GOOGLE_API_KEY, and
# optionally GOOGLE_API_KEY_0..GOOGLE_API_KEY_3 for per-client keys.
def _make_gemini(key_env: str) -> ChatGoogleGenerativeAI:
    """Build a low-temperature Gemini chat client keyed by *key_env*.

    Falls back to GOOGLE_API_KEY when the per-client variable is unset.
    """
    api_key = os.environ.get(key_env) or os.environ.get("GOOGLE_API_KEY", "")
    return ChatGoogleGenerativeAI(
        model="gemini-1.0-pro-001",
        google_api_key=api_key,
        temperature=0.1,
    )


gemini = _make_gemini("GOOGLE_API_KEY_0")
gemini1 = _make_gemini("GOOGLE_API_KEY_1")
gemini2 = _make_gemini("GOOGLE_API_KEY_2")
gemini3 = _make_gemini("GOOGLE_API_KEY_3")

# The embedding API (genai.embed_content in get_embeddings) uses the same
# credential pool.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
19
+
20
+
21
def pdf_extractor(link):
    """Extract all page text from the PDF at *link*.

    Parameters
    ----------
    link : str
        Path or URL of a PDF document.

    Returns
    -------
    list[str]
        A single-element list (the shape text_splitter.create_documents
        expects). The element is '' when the PDF cannot be loaded — extraction
        stays best-effort, matching the original behavior.
    """
    text = ''
    try:
        loader = PyPDFLoader(link)
        pages = loader.load_and_split()
        # join() instead of repeated += — linear instead of quadratic.
        text = ''.join(page.page_content for page in pages)
    except Exception as exc:
        # Was a bare `except: pass`, which also swallowed SystemExit /
        # KeyboardInterrupt and hid every failure. Keep best-effort semantics
        # but catch only Exception and report what went wrong.
        print(f"pdf_extractor: failed to read {link}: {exc}")
    return [text]
34
+
35
def web_extractor(link):
    """Extract visible text from the web page at *link*.

    Parameters
    ----------
    link : str
        URL of an HTML/markdown page.

    Returns
    -------
    list[str]
        A single-element list (the shape text_splitter.create_documents
        expects). The element is '' when the page cannot be loaded —
        extraction stays best-effort, matching the original behavior.
    """
    text = ''
    try:
        loader = WebBaseLoader(link)
        pages = loader.load_and_split()
        # join() instead of repeated += — linear instead of quadratic.
        text = ''.join(page.page_content for page in pages)
    except Exception as exc:
        # Was a bare `except: pass`; narrow to Exception and surface the
        # failure while preserving the empty-result fallback.
        print(f"web_extractor: failed to read {link}: {exc}")
    return [text]
48
+
49
+
50
def feature_extraction(tag, history, context):
    """Merge new chunk text into the running summary for one manual section.

    Asks a Gemini chat model to fold *context* (text from one document chunk)
    into *history* (the accumulated summary) for the section named *tag*,
    and returns the model's updated summary text.

    Parameters:
        tag: Section name, e.g. "Product Introduction" (interpolated into
            the prompt as the FIELD to update).
        history: Previously accumulated text for this section.
        context: New chunk text that may contain additional details.

    Returns:
        str: The model's response content (the updated Tag_History).
    """

    # NOTE: the prompt below is runtime data sent to the model — do not
    # reword it casually; the model's output format depends on it.
    prompt = f'''
    You are an intelligent assistant tasked with updating product information. You have two data sources:
    1. Tag_History: Previously gathered information about the product.
    2. Tag_Context: New data that might contain additional details.

    Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.

    Guidelines:
    - Only add new details that are relevant to the {tag} FIELD.
    - Do not add or modify any other fields in the Tag_History.
    - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.

    Here is the data:

    Tag_Context: {str(context)}
    Tag_History: {history}

    Respond with the updated Tag_History.
    '''

    # Spread load across API keys. NOTE(review): only gemini/gemini1 are used
    # here while detailed_feature_extraction uses all four clients — confirm
    # whether gemini2/gemini3 are deliberately reserved.
    model = random.choice([gemini, gemini1])
    result = model.invoke(prompt)

    return result.content
76
+
77
def detailed_feature_extraction(find, context):
    """Fill a structured template from accumulated section text via Gemini.

    Asks the model to populate the fields of *find* (a dict-shaped template,
    stringified into the prompt) using the information in *context*.

    Parameters:
        find: Template whose fields should be filled (e.g. one sub-dict from
            detailed_history's `details`).
        context: Source text to extract field values from.

    Returns:
        str: The model's response content — the filled-in template as text
        (fields the model cannot fill are expected to be "None" per the
        prompt instructions).
    """

    # NOTE: the prompt below is runtime data sent to the model — do not
    # reword it casually.
    prompt = f'''
    You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
    1. Context: The gathered information about the product.
    2. Format: Details which need to be filled based on Context.

    Your job is to read the Context and update the relevant field in Format using Context.

    Guidelines:
    - Only add details that are relevant to the individual FIELD.
    - Do not add or modify any other fields in the Format.
    - If nothing found return None.

    Here is the data:

    The Context is {str(context)}
    The Format is {str(find)}
    '''

    # Round-robin across all four API-key-backed clients.
    model = random.choice([gemini, gemini1, gemini2, gemini3])
    result = model.invoke(prompt)

    return result.content
101
+
102
def detailed_history(history):
    """Expand each section summary in *history* into a structured breakdown.

    For every section key in *history*, sends the accumulated text through
    detailed_feature_extraction with the matching field template below, and
    stores the model's stringified answer back into `details`.

    Parameters:
        history: Mapping of section name -> accumulated summary text. Keys
            must be a subset of the template keys below — an unknown key
            raises KeyError at `details[key]`.

    Returns:
        dict: Section name -> str. Note the per-section value type changes
        from the template sub-dict to the model's string output.
    """

    # Field templates, one sub-dict per manual section. Values start as None
    # and are meant to be filled (as text) by the model.
    details = {
        "Introduction": {
            "Product Name": None,
            "Overview of the product": None,
            "Purpose of the manual": None,
            "Audience": None,
            "Additional Details": None
        },
        "Specifications": {
            "Technical specifications": None,
            "Performance metrics": None,
            "Additional Details": None
        },
        "Product Overview": {
            "Product features": None,
            "Key components and parts": None,
            "Additional Details": None
        },
        "Safety Information": {
            "Safety warnings and precautions": None,
            "Compliance and certification information": None,
            "Additional Details": None
        },
        "Installation Instructions": {
            "Unboxing and inventory checklist": None,
            "Step-by-step installation guide": None,
            "Required tools and materials": None,
            "Additional Details": None
        },
        "Setup and Configuration": {
            "Initial setup procedures": None,
            "Configuration settings": None,
            "Troubleshooting setup issues": None,
            "Additional Details": None
        },
        "Operation Instructions": {
            "How to use the product": None,
            "Detailed instructions for different functionalities": None,
            "User interface guide": None,
            "Additional Details": None
        },
        "Maintenance and Care": {
            "Cleaning instructions": None,
            "Maintenance schedule": None,
            "Replacement parts and accessories": None,
            "Additional Details": None
        },
        "Troubleshooting": {
            "Common issues and solutions": None,
            "Error messages and their meanings": None,
            "Support Information": None,
            "Additional Details": None
        },
        "Warranty Information": {
            "Terms and Conditions": None,
            "Service and repair information": None,
            "Additional Details": None
        },
        "Legal Information": {
            "Copyright information": None,
            "Trademarks and patents": None,
            "Disclaimers": None,
            "Additional Details": None

        }
    }

    # One model call per section; each replaces the template sub-dict with
    # the model's stringified answer.
    for key, val in history.items():

        find = details[key]

        details[key] = str(detailed_feature_extraction(find, val))

    return details
178
+
179
+
180
def get_embeddings(link):
    """Build per-section summaries and embedding vectors for one manual.

    Pipeline: extract the document text (web or PDF), split it into large
    chunks, fold every chunk into per-section summaries via concurrent
    Gemini calls, then embed each section summary.

    Parameters:
        link: URL or path of the manual. Markdown files and links whose
            characters 8:11 are 'en.' (presumably an 'https://en.' host —
            TODO confirm against callers) are fetched as web pages;
            everything else is treated as a PDF.

    Returns:
        tuple[dict, list]: (history, genai_embeddings) — section name ->
        summary text, and the embedding vectors in the same iteration order.
    """

    print(f"\nCreating Embeddings ----- {link}")
    history = {
        "Introduction": "",
        "Specifications": "",
        "Product Overview": "",
        "Safety Information": "",
        "Installation Instructions": "",
        "Setup and Configuration": "",
        "Operation Instructions": "",
        "Maintenance and Care": "",
        "Troubleshooting": "",
        "Warranty Information": "",
        "Legal Information": ""
    }

    # Extract Text -----------------------------
    print("Extracting Text")
    # endswith() replaces the fragile `link[-3:] == '.md'` slice comparison.
    if link.endswith('.md') or link[8:11] == 'en.':
        text = web_extractor(link)
    else:
        text = pdf_extractor(link)
    # Log only the size — the previous debug `print(text)` dumped the entire
    # document to stdout.
    print(f"Extracted {len(text[0])} characters")

    # Create Chunks ----------------------------
    print("Writing Tag Data")
    chunks = text_splitter.create_documents(text)

    # Chunks are processed sequentially (each builds on the previous
    # summaries); within a chunk, all sections are summarised concurrently.
    for chunk in chunks:

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_key = {
                executor.submit(
                    feature_extraction, f"Product {key}", history[key], chunk.page_content
                ): key for key in history
            }
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    response = future.result()
                    history[key] = response
                except Exception as e:
                    # A failed section keeps its previous summary.
                    print(f"Error processing {key}: {e}")

    # history = detailed_history(history)
    print("Creating Vectors")
    genai_embeddings = []

    # One embedding per section, in dict-iteration order (matches the order
    # of history's keys for the caller).
    for tag in history:
        result = genai.embed_content(
            model="models/embedding-001",
            content=history[tag],
            task_type="retrieval_document")
        genai_embeddings.append(result['embedding'])

    return history, genai_embeddings
237
+
238
# Shared splitter used by get_embeddings. (The module-level `global`
# statements that used to precede this were no-ops at module scope and have
# been removed.)
#
# Large chunks with small overlap: each chunk is summarised section-by-section
# by the LLM, so we want few, big chunks rather than many small ones.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=100,
    # "" enables character-level fallback splitting. The original list was
    # ["", '', " "] — `""` and `''` are the same string, so the duplicate is
    # dropped.
    separators=["", " "],
)


if __name__ == '__main__':
    pass