Upload 4 files
- helper/process_image.py +91 -0
- helper/process_pdf.py +195 -0
- helper/process_pdf_parallel.py +283 -0
- helper/upload_file_to_s3.py +67 -0
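
All four helpers read their configuration through `load_dotenv()`. A sketch of the `.env` file they expect, with the variable names taken from the `os.getenv` calls in the files below; every value here is a placeholder, not a real credential:

# .env — placeholder values only
MONGO_URI=mongodb+srv://user:password@cluster.example.mongodb.net
DB_NAME=your-db-name
COLLECTION_NAME=your-collection-name
FLASH_API=your-google-generative-ai-key
PINECONE_API=your-pinecone-api-key
PINECONE_INDEX=your-pinecone-index-name
AWS_ACCESS_KEY_ID=your-access-key-id
AWS_SECRET_ACCESS_KEY=your-secret-access-key
AWS_BUCKET_NAME=your-bucket-name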
helper/process_image.py
ADDED
@@ -0,0 +1,91 @@
import base64
import httpx
from pymongo import MongoClient
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os
import re
import json
from dotenv import load_dotenv

load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
collection2 = db['about_company']
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)

# Default to an empty description so the prompt below never references an undefined name
about_company = ""
about_company_doc = collection2.find_one({"type": "about_company"})
if about_company_doc:
    about_company = about_company_doc.get('company_description', '')

system_prompt_text = f"""Given is an image related to a company. Your task is to analyze the image, identify any text or notable visual elements, and provide a comprehensive, direct description of the image's contents, focusing on what it represents without abstract language or additional commentary. The response must be concise and focused, using only descriptive nouns and adjectives. If the image cannot be clearly described, respond with 'None.'
Company information is given below to understand the context.
- About Company: {about_company}
Ensure you give a valid JSON structure.
Expected Output format : {{"description":"String"}}
"""


def process_image_using_llm(image_url):
    try:
        # Download and base64-encode the image
        image_data = base64.b64encode(httpx.get(image_url).content).decode("utf-8")

        # Create the message with the system prompt and the image
        message = HumanMessage(
            content=[
                {"type": "text", "text": system_prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
            ],
        )

        # Invoke the model to extract information from the image
        response = model.invoke([message])
        print(f"LLM response for {image_url} is: {response}")

        # Use regex to extract the JSON part of the response
        match = re.search(r"\{.*\}", response.content.strip())
        if match:
            json_data = match.group(0)  # Extract JSON-like content as a string
            data = json.loads(json_data)  # Parse it as JSON

            # Get the description from the JSON data
            description = data.get("description", "None").strip()

            if description == "None":
                # The model could not describe the image; record that in MongoDB
                collection.update_one(
                    {"object_url": image_url},
                    {"$set": {"can_find_description": False}}
                )
                print(f"Marked {image_url} as can_find_description: False")
                return False
            else:
                # Store the description and set can_find_description to True
                collection.update_one(
                    {"object_url": image_url},
                    {"$set": {"description": description, "can_find_description": True}}
                )
                print("Saved description to MongoDB")
                return True
        else:
            print(f"No valid JSON found in the response for {image_url}. Marking as can_find_description: False")
            collection.update_one(
                {"object_url": image_url},
                {"$set": {"can_find_description": False}}
            )
            return False
    except Exception as e:
        print(f"Error processing {image_url}: {e}")
        return False
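
A minimal sketch of how this helper might be driven in a batch pass, assuming image documents carry the `object_url` field used in the updates above and that unprocessed documents lack the `can_find_description` flag; the `"type": "image"` filter is a guess at how image files are marked:

from helper.process_image import collection, process_image_using_llm

# Re-process every image document that has not been attempted yet
pending = collection.find({
    "type": "image",                             # assumed type marker
    "can_find_description": {"$exists": False},  # not yet attempted
})
for doc in pending:
    ok = process_image_using_llm(doc["object_url"])
    print(doc["object_url"], "->", "described" if ok else "failed")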
helper/process_pdf.py
ADDED
@@ -0,0 +1,195 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import fitz  # PyMuPDF
from pymongo import MongoClient
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
import os
import re
import json
import uuid
from dotenv import load_dotenv
import pinecone

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

# Initialize services
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

embed_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=FLASH_API
)
pc = pinecone.Pinecone(api_key=PINECONE_API)
index = pc.Index(PINECONE_INDEX)

# Directory for temp files
pdf_temp_dir = 'temp/pdf_files'
os.makedirs(pdf_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')


def download_pdf(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Fail early instead of writing an error page to disk
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        return pdf_path
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return None


def semantic_chunking_parallel(pages, url):
    """
    Perform semantic chunking for pages in parallel and store chunks in MongoDB.
    Each chunk is assigned a unique chunk_id.
    """
    semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

    def process_page(page):
        try:
            page_content = page["page_content"]
            page_number = page["page_number"]
            page_chunks = semantic_chunker.create_documents([page_content])

            enriched_chunks = []
            for chunk in page_chunks:
                chunk_id = str(uuid.uuid4())  # Generate a unique ID for the chunk
                enriched_chunks.append({
                    "chunk_id": chunk_id,
                    "chunk": chunk.page_content,
                    "page_number": page_number
                })
            return enriched_chunks
        except Exception as e:
            print(f"Error processing page {page['page_number']}: {e}")
            return []

    all_chunks = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_page, page): page for page in pages}
        for future in as_completed(futures):
            all_chunks.extend(future.result())

    # Insert chunks into MongoDB
    for chunk in all_chunks:
        collection.update_one(
            {"object_url": url},
            {"$addToSet": {"chunks": chunk}},
            upsert=True
        )

    return all_chunks


def create_embedding_parallel(url, tags, categories):
    """
    Create embeddings for chunks retrieved from MongoDB and insert them into Pinecone.
    """
    document = collection.find_one({"object_url": url})
    if not document or "chunks" not in document:
        print(f"No chunks found for URL: {url}")
        return 0
    # Look up the Mongo ID only after confirming the document exists
    mongo_id = str(document.get('_id'))

    chunks = document["chunks"]

    def process_chunk(chunk):
        try:
            chunk_id = chunk["chunk_id"]
            description = chunk["chunk"]
            page_number = chunk["page_number"]

            embedding = embed_model.embed_query(description)
            pinecone_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, description))

            # Insert into Pinecone
            vector = {
                'id': pinecone_id,
                'values': embedding,
                'metadata': {
                    'description': description,
                    "url": url,
                    "page_number": page_number,
                    "tags": ','.join(tags),
                    "categories": ','.join(categories),
                    "filetype": "PDF",
                    "mongo_id": mongo_id,
                }
            }
            index.upsert([vector])
            print(f"Inserted chunk {chunk_id} from page {page_number} into Pinecone.")

            # Update MongoDB with the Pinecone ID
            collection.update_one(
                {"object_url": url, "chunks.chunk_id": chunk_id},
                {
                    "$set": {
                        "chunks.$.pinecone_id": pinecone_id,
                        "chunks.$.successfully_embedding_created": True
                    }
                }
            )
            return True
        except Exception as e:
            print(f"Error creating embedding for chunk {chunk['chunk_id']}: {e}")
            return False

    failed_chunks = 0
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}
        for future in as_completed(futures):
            if not future.result():
                failed_chunks += 1
    return failed_chunks


def process_pdf(url, tags, categories):
    """
    Process a PDF: download, chunk, and create embeddings.
    """
    print(f"Processing PDF with URL: {url}")
    if not download_pdf(url):
        return False

    all_pages, total_pages = extract_text_from_pdf(pdf_path)
    print(f"Total pages: {total_pages}")

    print("Performing page-level semantic chunking in parallel...")
    semantic_chunks_with_pages = semantic_chunking_parallel(all_pages, url)
    print(f"Total chunks inserted into MongoDB: {len(semantic_chunks_with_pages)}")

    print("Creating embeddings and inserting into Pinecone in parallel...")
    failed_chunks = create_embedding_parallel(url, tags, categories)
    print(f"Total chunks successfully embedded: {len(semantic_chunks_with_pages) - failed_chunks}")
    print(f"Total chunks failed: {failed_chunks}")

    return failed_chunks < len(semantic_chunks_with_pages)


def extract_text_from_pdf(pdf_path):
    """
    Extract text from each page of a PDF using fitz.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        all_pages = []
        total_pages = len(pdf_document)
        for page_num in range(total_pages):
            page = pdf_document[page_num]
            text = page.get_text()
            all_pages.append({"page_content": text, "page_number": page_num + 1})
    finally:
        pdf_document.close()
    return all_pages, total_pages
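
A minimal usage sketch, assuming the environment variables above are set; the URL, tags, and categories are placeholders for values that presumably come from the file's MongoDB metadata:

from helper.process_pdf import process_pdf

# Placeholder inputs for illustration only
url = "https://your-bucket.s3.amazonaws.com/MoSPI_files/sample.pdf"
tags = ["survey", "2024"]
categories = ["reports"]

if process_pdf(url, tags, categories):
    print("At least one chunk was embedded successfully")
else:
    print("Processing failed for every chunk")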
helper/process_pdf_parallel.py
ADDED
@@ -0,0 +1,283 @@
import concurrent.futures
import requests
from pdf2image import convert_from_path
import base64
from pymongo import MongoClient
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.messages import HumanMessage
import os
import re
import json
import uuid
from dotenv import load_dotenv
import pinecone

load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
collection2 = db['about_company']
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=FLASH_API
)
pc = pinecone.Pinecone(api_key=PINECONE_API)
index = pc.Index(PINECONE_INDEX)

# Default to an empty description so the prompt below never references an undefined name
about_company = ""
about_company_doc = collection2.find_one({"type": "about_company"})
if about_company_doc:
    about_company = about_company_doc.get('company_description', '')

pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'

os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)

pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')


def download_and_split_pdf_to_image(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Fail early instead of writing an error page to disk
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return None

    try:
        # Render each PDF page to a PNG image
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None


system_prompt_text = f"""Given is an image of a PDF page. Your task is to extract all the information from this image and give a detailed summary of the page; do not miss out on any information, and include keywords or any terms mentioned in the PDF.
Given below is information about the company whose PDF page is given to you, to understand the context.
- About Company: {about_company}
Follow the expected output format given below:
Expected Output format : {{"description":"String"}}
"""


def process_image_using_llm(image, page_number, url):
    try:
        message = HumanMessage(
            content=[
                {"type": "text", "text": system_prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
            ],
        )
        response = model.invoke([message])
        print(f"LLM response for page {page_number}: {response}")

        # Extract JSON from the response content using regex
        match = re.search(r"\{.*\}", response.content.strip())
        if match:
            json_data = match.group(0)

            # Step 1: Normalise a Python-style 'description' field to valid JSON.
            # Match the value whether it is single- or double-quoted.
            description_match = re.search(r"['\"]description['\"]\s*:\s*('.*?'|\".*?\")", json_data)

            if description_match:
                description_text = description_match.group(1)

                # Convert outer single quotes to double quotes, escaping any inner
                # double quotes; single quotes need no escaping inside JSON strings
                if description_text.startswith("'") and description_text.endswith("'"):
                    inner = description_text[1:-1].replace('"', '\\"')
                    description_text = f'"{inner}"'

                # Splice the normalised value back into the JSON string
                json_data = (
                    json_data[:description_match.start(1)] +
                    description_text +
                    json_data[description_match.end(1):]
                )
                # Make sure the key itself uses double quotes
                json_data = json_data.replace("'description'", '"description"')

            # Step 2: Attempt to load the cleaned JSON string
            try:
                data = json.loads(json_data)
                description = data.get("description", "None").strip()
                can_find_description = description != "None"

                return {
                    "page_number": page_number,
                    "description": description if can_find_description else None,
                    "can_find_description": can_find_description
                }
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON for page {page_number}: {e}")
                return {
                    "page_number": page_number,
                    "description": None,
                    "can_find_description": False
                }
        else:
            print(f"No valid JSON found in the response for page {page_number}")
            return {
                "page_number": page_number,
                "description": None,
                "can_find_description": False
            }

    except Exception as e:
        print(f"Error processing page {page_number}: {e}")
        return {
            "page_number": page_number,
            "description": None,
            "can_find_description": False
        }


def create_embedding_for_pdf_chunks(page, description, url, tags, categories):
    try:
        document = collection.find_one({'object_url': url})
        file_type = document.get("type")
        mongo_id = str(document.get('_id'))
        embedding = google_embeddings.embed_query(description)
        pinecone_id = str(uuid.uuid4())

        vectors = [{
            'id': pinecone_id,
            'values': embedding,
            'metadata': {
                'description': description,
                "url": url,
                "page_number": page,
                "tag": file_type,
                "mongo_id": mongo_id,  # Store the MongoDB ID in metadata
                "tags": ','.join(tags),
                "categories": ','.join(categories)
            }
        }]
        index.upsert(vectors)
        print(f"Inserted: page {page} in Pinecone with MongoDB ID {mongo_id} in metadata")

        collection.update_one(
            {
                "_id": document["_id"],
                "chunks.page_number": page  # Match document and specific chunk by page number
            },
            {
                "$set": {
                    "chunks.$.pinecone_id": pinecone_id,
                    "chunks.$.successfully_embedding_created": True
                }
            }
        )
        return True

    except Exception as e:
        print(f"Error occurred while creating embedding for page {page} of {url}: {e}")
        # Filter by object_url here: `document` may be unbound if find_one itself failed
        collection.update_one(
            {
                "object_url": url,
                "chunks.page_number": page  # Match document and specific chunk by page number
            },
            {
                "$set": {
                    "chunks.$.successfully_embedding_created": False
                }
            }
        )
        return False


def process_image_and_create_embedding(page_number, image_path, url, tags, categories):
    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode("utf-8")

    # Process the image using the LLM to get a description
    page_result = process_image_using_llm(image_data, page_number, url)

    # If a description is available, create an embedding
    if page_result.get("description"):
        create_embedding_for_pdf_chunks(page_number, page_result["description"], url, tags, categories)
    else:
        print(f"Skipping page {page_number} as description is None")

    return page_result


def cleanup_directory(directory_path):
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")


def process_pdf(url, tags, categories):
    print(f"Processing PDF with URL: {url}")
    if not download_and_split_pdf_to_image(url):
        return False

    chunks = []
    image_files = sorted(
        os.listdir(image_temp_dir),
        key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
    )

    # Use ThreadPoolExecutor to process each page in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                process_image_and_create_embedding,
                count,
                os.path.join(image_temp_dir, image_name),
                url,
                tags,
                categories
            )
            for count, image_name in enumerate(image_files, start=1)
        ]

        # Collect results as each thread completes
        for future in concurrent.futures.as_completed(futures):
            try:
                page_result = future.result()
                chunks.append(page_result)
            except Exception as e:
                print(f"Error processing page: {e}")

    # Update the MongoDB document with the collected chunks
    collection.update_one(
        {"object_url": url},
        {"$set": {"chunks": chunks}},
        upsert=True
    )
    print("Saved chunks to MongoDB.")

    # Cleanup directories
    cleanup_directory(pdf_temp_dir)
    cleanup_directory(image_temp_dir)

    # Check how many pages failed to create embeddings
    total_pages = len(chunks)
    failed_pages = sum(1 for chunk in chunks if not chunk.get("can_find_description"))
    return failed_pages < total_pages
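
Both PDF helpers expose the same `process_pdf(url, tags, categories)` entry point, which suggests a caller can choose between the text-based route (process_pdf.py, fitz text extraction) and this page-image route (pdf2image + Gemini vision, which also covers scanned pages where `get_text()` yields nothing). A hedged sketch of such a dispatcher; `ingest_pdf` and the inputs are hypothetical:

from helper import process_pdf as text_pipeline
from helper import process_pdf_parallel as vision_pipeline

def ingest_pdf(url, tags, categories, use_vision=False):
    # Text extraction is cheaper; the vision route handles scanned pages and figures
    pipeline = vision_pipeline if use_vision else text_pipeline
    return pipeline.process_pdf(url, tags, categories)

ingest_pdf("https://example.com/doc.pdf", ["tag"], ["category"], use_vision=True)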
helper/upload_file_to_s3.py
ADDED
@@ -0,0 +1,67 @@
from pymongo import MongoClient
from datetime import datetime
import boto3
import uuid
import os
from dotenv import load_dotenv

load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)


def upload_file(file, filetype):
    try:
        # Generate a unique key for the file using a UUID
        uuid_str = str(uuid.uuid4())
        file_name = file.name
        s3_key = f'MoSPI_files/{uuid_str}-{file_name}'

        # Upload the file to S3, preserving its MIME type
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': file.type}  # Set the MIME type of the uploaded file
        )

        file_size = file.size
        upload_time = datetime.now()

        # Record the date and time separately
        upload_date = upload_time.strftime('%Y-%m-%d')
        upload_time_only = upload_time.strftime('%H:%M:%S')

        # Metadata to store in MongoDB
        metadata = {
            'name': file_name,
            'size': file_size,
            'type': filetype,
            'status': 'unprocessed',
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_date,
            'time_uploaded': upload_time_only,
            'accuracy': None
        }

        # Insert metadata into MongoDB
        collection.insert_one(metadata)
        return metadata

    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None
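
`upload_file` reads `file.name`, `file.size`, and `file.type`, which matches the interface of a Streamlit `UploadedFile`, so the Space's UI is presumably Streamlit. A sketch of how the helper might be wired into an upload page under that assumption; the extension-to-`filetype` mapping is hypothetical:

import streamlit as st
from helper.upload_file_to_s3 import upload_file

uploaded = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
if uploaded is not None:
    # Hypothetical mapping from extension to the 'type' stored in MongoDB
    filetype = "pdf" if uploaded.name.lower().endswith(".pdf") else "image"
    metadata = upload_file(uploaded, filetype)
    if metadata:
        st.success(f"Uploaded to {metadata['object_url']}")
    else:
        st.error("Upload failed")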