akshansh36 commited on
Commit
2fcb36b
·
verified ·
1 Parent(s): b9500c0

Upload 4 files

Browse files
helper/process_image.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import base64
import httpx
from pymongo import MongoClient
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os
import re
import json
from dotenv import load_dotenv

# Connection settings come from the environment (.env during development).
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")

# Shared clients created once at import time.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
collection2 = db['about_company']
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)

# Bug fix: default to an empty description so the f-string below cannot raise
# NameError when no 'about_company' document exists in MongoDB. The original
# only assigned about_company inside the `if`.
about_company = ''
about_company_doc = collection2.find_one({"type": "about_company"})
if about_company_doc:
    about_company = about_company_doc.get('company_description', '')

system_prompt_text = f"""Given is an image related to a company. Your task is to analyze the image, identify any text or notable visual elements, and provide a comprehensive, direct description of the image's contents, focusing on what it represents without abstract language or additional commentary. The response must be concise and focused, using only descriptive nouns and adjectives. If the image cannot be clearly described, respond with 'None.'
Company information is given below to understand the context.
- About Company: {about_company}
Ensure you give valid JSON structure.
Expected Output format : {{"description":"String"}}

"""
def _set_description_flag(image_url, value):
    """Set only the can_find_description flag on the document for ``image_url``."""
    collection.update_one(
        {"object_url": image_url},
        {"$set": {"can_find_description": value}}
    )


def process_image_using_llm(image_url):
    """Describe the image at ``image_url`` with Gemini and store the result in MongoDB.

    Downloads the image, asks the model for a {"description": ...} JSON payload,
    and updates the MongoDB document whose ``object_url`` matches ``image_url``
    with either the description (and ``can_find_description: True``) or just
    ``can_find_description: False``.

    Returns:
        True when a description was extracted and saved, False otherwise.
    """
    try:
        # Bug fix: the original had no timeout and never checked the HTTP
        # status, so an error page could be base64-fed to the model.
        http_response = httpx.get(image_url, timeout=30.0)
        http_response.raise_for_status()
        image_data = base64.b64encode(http_response.content).decode("utf-8")

        # One message carrying both the system prompt and the inline image.
        message = HumanMessage(
            content=[
                {"type": "text", "text": system_prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
            ],
        )

        response = model.invoke([message])
        print(f"llm response for {image_url} is : {response}")

        # The model is asked for JSON but may wrap it in prose; pull out the
        # first {...} span.
        match = re.search(r"\{.*\}", response.content.strip())
        if not match:
            print(f"No valid JSON found in the response for {image_url}. Marking as can_find_description: False")
            _set_description_flag(image_url, False)
            return False

        data = json.loads(match.group(0))
        description = data.get("description", "None").strip()

        if description == "None":
            # The model explicitly signalled it could not describe the image.
            _set_description_flag(image_url, False)
            print(f"Marked {image_url} as can_find_description: False")
            return False

        collection.update_one(
            {"object_url": image_url},
            {"$set": {"description": description, "can_find_description": True}}
        )
        print("Saved description to MongoDB")
        return True
    except Exception as e:
        # Best-effort pipeline: any failure (network, JSON, Mongo) is logged
        # and reported as False rather than raised.
        print(f"Error processing {image_url}: {e}")
        return False
helper/process_pdf.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Module-level setup: environment config, MongoDB, Google embeddings and
# Pinecone clients shared by all functions in this file.
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import fitz
from pymongo import MongoClient
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
import os
import re
import json
import uuid
from dotenv import load_dotenv
import pinecone

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

# Initialize services
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

embed_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",  # Correct model name
    google_api_key=FLASH_API  # Your API key
)
pc = pinecone.Pinecone(
    api_key=PINECONE_API  # Your Pinecone API key
)
index = pc.Index(PINECONE_INDEX)

# Directories for temp files
# NOTE(review): a single shared download path means concurrent process_pdf
# calls would clobber each other's file — confirm single-flight usage.
pdf_temp_dir = 'temp/pdf_files'
os.makedirs(pdf_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def download_pdf(url):
    """Download the PDF at ``url`` to the module-level ``pdf_path``.

    Returns:
        The local file path on success, or None on any failure (network
        error, timeout, or non-2xx HTTP status).
    """
    try:
        response = requests.get(url, timeout=60)
        # Bug fix: without this check a 404/500 error body was silently
        # written to disk and the path returned as a successful download.
        response.raise_for_status()
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        return pdf_path
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return None
def semantic_chunking_parallel(pages, url):
    """Semantically chunk each page's text in parallel and persist the chunks.

    Every chunk receives a fresh UUID ``chunk_id`` and is appended to the
    ``chunks`` array of the MongoDB document whose ``object_url`` matches
    ``url`` (upserting the document if needed).

    Returns the full list of chunk dicts
    ({"chunk_id", "chunk", "page_number"}).
    """
    chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

    def chunk_one_page(page):
        # A failure on one page must not abort the whole document.
        try:
            docs = chunker.create_documents([page["page_content"]])
            return [
                {
                    "chunk_id": str(uuid.uuid4()),
                    "chunk": doc.page_content,
                    "page_number": page["page_number"],
                }
                for doc in docs
            ]
        except Exception as e:
            print(f"Error processing page {page['page_number']}: {e}")
            return []

    collected = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = [pool.submit(chunk_one_page, page) for page in pages]
        for done in as_completed(pending):
            collected.extend(done.result())

    # $addToSet keeps re-runs from inserting byte-identical duplicate chunks.
    for chunk_doc in collected:
        collection.update_one(
            {"object_url": url},
            {"$addToSet": {"chunks": chunk_doc}},
            upsert=True,
        )

    return collected
def create_embedding_parallel(url, tags, categories):
    """Embed every stored chunk for ``url`` and upsert the vectors into Pinecone.

    Reads the chunks previously written by ``semantic_chunking_parallel`` from
    MongoDB, embeds each chunk's text, upserts the vector into the Pinecone
    index and records the resulting ``pinecone_id`` back on the chunk.

    Returns:
        The number of chunks that failed to embed (0 on full success or when
        there is nothing to do).
    """
    document = collection.find_one({"object_url": url})
    # Bug fix: check for a missing document *before* dereferencing it — the
    # original called document.get('_id') first, which raises AttributeError
    # when no document matches the URL.
    if not document or "chunks" not in document:
        print(f"No chunks found for URL: {url}")
        return 0
    mongo_id = str(document.get('_id'))

    chunks = document["chunks"]

    def process_chunk(chunk):
        try:
            chunk_id = chunk["chunk_id"]
            description = chunk["chunk"]
            page_number = chunk["page_number"]

            embedding = embed_model.embed_query(description)
            # uuid5 over the text keeps re-embedding the same chunk idempotent
            # in Pinecone (same content -> same vector id).
            pinecone_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, description))

            vector = {
                'id': pinecone_id,
                'values': embedding,
                'metadata': {
                    'description': description,
                    "url": url,
                    "page_number": page_number,
                    "tags": ','.join(tags),
                    "categories": ','.join(categories),
                    "filetype": "PDF",
                    "mongo_id": mongo_id,
                }
            }
            index.upsert([vector])
            print(f"Inserted chunk {chunk_id} from page {page_number} into Pinecone.")

            # Record the Pinecone id on the matching chunk element.
            collection.update_one(
                {"object_url": url, "chunks.chunk_id": chunk_id},
                {
                    "$set": {
                        "chunks.$.pinecone_id": pinecone_id,
                        "chunks.$.successfully_embedding_created": True
                    }
                }
            )
            return True
        except Exception as e:
            print(f"Error creating embedding for chunk {chunk['chunk_id']}: {e}")
            return False

    failed_chunks = 0
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_chunk, chunk) for chunk in chunks]
        for future in as_completed(futures):
            if not future.result():
                failed_chunks += 1
    return failed_chunks
def process_pdf(url, tags, categories):
    """Process a PDF end to end: download, chunk, and create embeddings.

    Returns:
        True when at least one chunk was embedded successfully, False when
        the download failed or every chunk failed to embed.
    """
    print(f"Processing PDF with URL: {url}")
    # Bug fix: the original implicitly returned None when the download
    # failed; make the failure path explicit for boolean callers.
    if not download_pdf(url):
        return False

    all_pages, total_pages = extract_text_from_pdf(pdf_path)
    print(f"Total pages: {total_pages}")

    print("Performing page-level semantic chunking in parallel...")
    semantic_chunks_with_pages = semantic_chunking_parallel(all_pages, url)
    print(f"Total chunks inserted into MongoDB: {len(semantic_chunks_with_pages)}")

    print("Creating embeddings and inserting into Pinecone in parallel...")
    failed_chunks = create_embedding_parallel(url, tags, categories)
    print(f"Total chunks successfully embedded: {len(semantic_chunks_with_pages) - failed_chunks}")
    print(f"Total chunks failed: {failed_chunks}")

    # True iff at least one chunk succeeded (also False when zero chunks).
    return failed_chunks < len(semantic_chunks_with_pages)
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using fitz (PyMuPDF).

    Returns:
        A tuple ``(pages, total)`` where ``pages`` is a list of
        ``{"page_content": str, "page_number": int}`` dicts with 1-based
        page numbers, and ``total`` is the page count.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        total_pages = len(pdf_document)
        # fitz documents iterate over their pages in order.
        all_pages = [
            {"page_content": page.get_text(), "page_number": number}
            for number, page in enumerate(pdf_document, start=1)
        ]
    finally:
        # Always release the underlying file handle.
        pdf_document.close()
    return all_pages, total_pages
helper/process_pdf_parallel.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import concurrent.futures
import requests
from pdf2image import convert_from_path
import base64
from pymongo import MongoClient
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.messages import HumanMessage
import os
import re
import json
import uuid
from dotenv import load_dotenv
import pinecone

# Connection settings come from the environment (.env during development).
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

# Shared clients created once at import time.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
collection2 = db['about_company']
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",  # Correct model name
    google_api_key=FLASH_API  # Your API key
)
pc = pinecone.Pinecone(
    api_key=PINECONE_API  # Your Pinecone API key
)

index = pc.Index(PINECONE_INDEX)

# Bug fix: default to an empty description so the prompt f-string defined
# later in this module cannot raise NameError when the 'about_company'
# document is missing. The original only assigned inside the `if`.
about_company = ''
about_company_doc = collection2.find_one({"type": "about_company"})
if about_company_doc:
    about_company = about_company_doc.get('company_description', '')

# Scratch space for the downloaded PDF and its per-page PNG renders.
# NOTE(review): these shared paths mean concurrent process_pdf calls would
# clobber each other — confirm single-flight usage.
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'

os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)

pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def download_and_split_pdf_to_image(url):
    """Download the PDF at ``url`` and render each page to a PNG.

    The PDF is written to the shared ``pdf_path`` and page images are saved
    as ``page_<n>.png`` under ``image_temp_dir``.

    Returns:
        True on success, None on any failure (download or conversion).
    """
    try:
        # Bug fix: the original had no timeout and never checked the HTTP
        # status, so an error body could be saved and "converted".
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"error occured during downloading pdf from object url : {e}")
        return None

    try:
        images = convert_from_path(pdf_path)
        # 1-based page numbering matches the rest of the pipeline.
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"error occured in converting pdf pages to image : {e}")
        return None
# Prompt for per-page summarisation. The model is instructed to answer as
# JSON ({"description": "..."}); process_image_using_llm parses that out.
system_prompt_text = f"""Given is an image of a PDF page.Your task is to extract all the information from this image and give a detailed summary of the page, do not miss out on any information, include keywords or any terms mentioned in the pdf.'
Given below is a company information whose pdf page is givn to you, to understand the context.
- About Company: {about_company}
Follow this Expected output format given below:
Expected Output format : {{"description":"String"}}

"""
def _repair_description_json(json_data):
    """Best-effort repair of the LLM's {"description": ...} payload.

    The model sometimes emits the payload Python-style, with single quotes.
    JSON only allows double-quoted strings, so this converts a single-quoted
    'description' key and value to double quotes, escaping any inner double
    quotes. Bug fix vs. the original: it escaped single quotes as \\' (an
    invalid JSON escape) and left the key single-quoted, so json.loads failed
    in exactly the cases the repair was meant to handle.
    """
    description_match = re.search(r"'description'\s*:\s*('.*?'|\".*?\")", json_data)
    if not description_match:
        return json_data

    description_text = description_match.group(1)
    if description_text.startswith("'") and description_text.endswith("'"):
        # Re-quote the value with double quotes; single quotes inside a JSON
        # string are legal as-is, only inner double quotes need escaping.
        inner = description_text[1:-1].replace('"', '\\"')
        description_text = f'"{inner}"'

    repaired = (
        json_data[:description_match.start(1)]
        + description_text
        + json_data[description_match.end(1):]
    )
    # The key itself was also single-quoted (that is what the regex matched).
    return repaired.replace("'description'", '"description"', 1)


def process_image_using_llm(image, page_number, url):
    """Summarize one PDF-page image with Gemini.

    Args:
        image: base64-encoded page image (string).
        page_number: 1-based page index, echoed into the result.
        url: source PDF URL (kept for the caller's interface).

    Returns:
        A dict with ``page_number``, ``description`` (str or None) and
        ``can_find_description`` (bool). Never raises.
    """
    def failure():
        return {
            "page_number": page_number,
            "description": None,
            "can_find_description": False
        }

    try:
        message = HumanMessage(
            content=[
                {"type": "text", "text": system_prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
            ],
        )
        response = model.invoke([message])
        print(f"LLM response for page {page_number}: {response}")

        # The model may wrap its JSON in prose; pull out the first {...} span.
        match = re.search(r"\{.*\}", response.content.strip())
        if not match:
            print(f"No valid JSON found in the response for page {page_number}")
            return failure()

        json_data = _repair_description_json(match.group(0))
        try:
            data = json.loads(json_data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for page {page_number}: {e}")
            return failure()

        description = data.get("description", "None").strip()
        can_find_description = description != "None"
        return {
            "page_number": page_number,
            "description": description if can_find_description else None,
            "can_find_description": can_find_description
        }
    except Exception as e:
        print(f"Error processing page {page_number}: {e}")
        return failure()
def create_embedding_for_pdf_chunks(page, description, url, tags, categories):
    """Embed one page description and upsert the vector into Pinecone.

    On success the chunk for ``page`` on the MongoDB document is stamped with
    its ``pinecone_id`` and ``successfully_embedding_created: True``; on
    failure the flag is set to False instead.

    Returns:
        True on success, False on any failure (never raises).
    """
    # Bug fix: fetch and validate the document *before* the try-block. The
    # original dereferenced a possibly-None document, and its except-handler
    # used `document` and `mongo_id` which could be unbound at that point,
    # raising a secondary NameError/AttributeError.
    document = collection.find_one({'object_url': url})
    if not document:
        print(f"No MongoDB document found for url {url}; skipping page {page}")
        return False
    mongo_id = str(document.get('_id'))

    try:
        file_type = document.get("type")
        embedding = google_embeddings.embed_query(description)
        pinecone_id = str(uuid.uuid4())

        vectors = [{
            'id': pinecone_id,
            'values': embedding,
            'metadata': {
                'description': description,
                "url": url,
                "page_number": page,
                "tag": file_type,
                "mongo_id": mongo_id,
                "tags": ','.join(tags),
                "categories": ','.join(categories)  # Store MongoDB ID in metadata
            }
        }]
        index.upsert(vectors)
        print(f"Inserted: page {page} in Pinecone with MongoDB ID {mongo_id} in metadata")

        collection.update_one(
            {
                "_id": document["_id"],
                "chunks.page_number": page  # Match document and specific chunk by page number
            },
            {
                "$set": {
                    "chunks.$.pinecone_id": pinecone_id,
                    "chunks.$.successfully_embedding_created": True
                }
            }
        )
        return True
    except Exception as e:
        print(f"error occured in creating embedding for pdf with mongo id {mongo_id} for page {page}")
        collection.update_one(
            {
                "_id": document["_id"],
                "chunks.page_number": page  # Match document and specific chunk by page number
            },
            {
                "$set": {
                    "chunks.$.successfully_embedding_created": False
                }
            }
        )
        return False
209
+
def process_image_and_create_embedding(page_number, image_path, url, tags, categories):
    """Run the per-page pipeline: read the image, describe it, embed it.

    Reads the PNG at ``image_path``, asks the LLM for a description, and —
    when a description came back — creates the Pinecone embedding for it.

    Returns the page-result dict produced by ``process_image_using_llm``.
    """
    with open(image_path, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode("utf-8")

    result = process_image_using_llm(encoded, page_number, url)

    description = result.get("description")
    if description:
        create_embedding_for_pdf_chunks(page_number, description, url, tags, categories)
    else:
        print(f"Skipping page {page_number} as description is None")

    return result
def cleanup_directory(directory_path):
    """Delete every regular file directly inside ``directory_path``.

    Subdirectories (and their contents) are left untouched. Errors are
    logged rather than raised so cleanup never aborts the pipeline.
    """
    try:
        for entry in os.listdir(directory_path):
            candidate = os.path.join(directory_path, entry)
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")
236
+
def process_pdf(url, tags, categories):
    """Download a PDF, describe each page with the LLM, embed, and store chunks.

    Pages are rendered to PNGs, processed in parallel, and the per-page
    results are written as the ``chunks`` array of the MongoDB document for
    ``url``. Temp directories are cleaned up afterwards.

    Returns:
        True when at least one page produced a description, False when every
        page failed or the download/convert step failed.
    """
    print(f"Processing PDF with URL: {url}")
    # Bug fix: the original implicitly returned None when the download or
    # page-render step failed; make the failure path explicit.
    if not download_and_split_pdf_to_image(url):
        return False

    chunks = []
    # Sort page_<n>.png numerically so page numbers line up with render order.
    image_files = sorted(
        os.listdir(image_temp_dir),
        key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
    )

    # Process each page in parallel.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                process_image_and_create_embedding,
                count,
                os.path.join(image_temp_dir, image_name),
                url,
                tags,
                categories
            )
            for count, image_name in enumerate(image_files, start=1)
        ]

        for future in concurrent.futures.as_completed(futures):
            try:
                chunks.append(future.result())
            except Exception as e:
                print(f"Error processing page: {e}")

    # Replace the document's chunks with this run's results.
    collection.update_one(
        {"object_url": url},
        {"$set": {"chunks": chunks}},
        upsert=True
    )
    print("Saved chunks to MongoDB.")

    cleanup_directory(pdf_temp_dir)
    cleanup_directory(image_temp_dir)

    # True iff at least one page yielded a description.
    total_pages = len(chunks)
    failed_pages = sum(1 for chunk in chunks if not chunk.get("can_find_description"))
    return failed_pages < total_pages
helper/upload_file_to_s3.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Module-level setup: AWS/S3 and MongoDB clients shared by upload_file.
from pymongo import MongoClient
from datetime import datetime
import boto3
import uuid
import os
from dotenv import load_dotenv

# Credentials and connection settings come from the environment (.env).
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
# Explicit-credential S3 client (no instance-profile fallback).
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
def upload_file(file, filetype):
    """Upload a file-like object (e.g. a Streamlit upload) to S3 and record it.

    The object is stored under a UUID-prefixed key in ``AWS_BUCKET_NAME``,
    then a metadata document (status 'unprocessed') is inserted into MongoDB.

    Returns:
        The metadata dict (with MongoDB's ``_id`` added by insert_one), or
        None when the upload or insert fails.
    """
    try:
        file_name = file.name
        # UUID prefix keeps same-named uploads from colliding in the bucket.
        s3_key = f'MoSPI_files/{str(uuid.uuid4())}-{file_name}'

        # Preserve the browser-reported MIME type on the stored object.
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': file.type}
        )

        now = datetime.now()

        metadata = {
            'name': file_name,
            'size': file.size,
            'type': filetype,
            'status': 'unprocessed',
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': now.strftime('%Y-%m-%d'),
            'time_uploaded': now.strftime('%H:%M:%S'),
            'accuracy': None
        }

        # insert_one mutates metadata in place, adding the new '_id'.
        collection.insert_one(metadata)
        return metadata

    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None