import os import streamlit as st import fitz # PyMuPDF from google.cloud import language_v1 import requests import json from dotenv import load_dotenv from pinecone import Pinecone, ServerlessSpec # Load the environment variables from the .env file load_dotenv() google_api_key = os.getenv('GOOGLE_API_KEY') pinecone_api_key = os.getenv('PINECONE_API_KEY') # Initialize Pinecone try: pc = Pinecone(api_key=pinecone_api_key) except Exception as e: st.error(f"Error initializing Pinecone: {e}") st.stop() index_name = 'pdf-analysis' if index_name not in pc.list_indexes().names(): try: pc.create_index( name=index_name, dimension=768, metric='euclidean', spec=ServerlessSpec( cloud='aws', region='us-west-2' ) ) except Exception as e: st.error(f"Error creating Pinecone index: {e}") st.stop() # Function to analyze entities and get embeddings using the API key def get_embeddings(text, api_key): url = f"https://language.googleapis.com/v1/documents:analyzeEntities?key={api_key}" headers = { "Content-Type": "application/json", } data = { "document": { "type": "PLAIN_TEXT", "content": text }, "encodingType": "UTF8" } try: response = requests.post(url, headers=headers, json=data) response.raise_for_status() embeddings = response.json() return embeddings except requests.exceptions.RequestException as e: st.error(f"Error getting embeddings: {e}") return None # Streamlit app st.title("Chat with Your Document") st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.") # File upload uploaded_file = st.file_uploader("Choose a PDF file", type="pdf") if uploaded_file is not None: try: # Load the PDF file pdf_document = fitz.open(stream=uploaded_file.read(), filetype="pdf") pdf_text = "" for page_num in range(pdf_document.page_count): page = pdf_document.load_page(page_num) pdf_text += page.get_text() # Get embeddings for the PDF text embeddings = get_embeddings(pdf_text, google_api_key) if embeddings is None: st.stop() vectors = [(str(i), embedding) for i, embedding in enumerate(embeddings['entities'])] # Create or connect to Pinecone index index = pc.Index(index_name) index.upsert(vectors) # Chat with the document user_input = st.text_input("Ask a question about the document:") if st.button("Ask"): if user_input: # Get embeddings for the user query user_query_embeddings = get_embeddings(user_input, google_api_key) if user_query_embeddings is None: st.stop() query_vector = user_query_embeddings['entities'][0]['name'] # Perform similarity search results = index.query(query_vector, top_k=5) response_text = "Relevant information from the document:\n" for result in results['matches']: response_text += f"Text: {result['text']}, Score: {result['score']}\n" st.write(response_text.strip()) else: st.write("Please enter a question to ask.") # Display the PDF text st.write("Extracted Text from PDF:") st.write(pdf_text) except Exception as e: st.error(f"Error processing PDF file: {e}") ##