import os

import numpy as np
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.notebook import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
def get_text_from_document(document):
    # The PDF loaders return a list of Document objects, each with a
    # page_content attribute; plain strings are joined as-is.
    text = "".join(
        doc.page_content if hasattr(doc, "page_content") else doc
        for doc in document
    ).replace("\n\n", "\n")
    print(f"Total length of text: {len(text)} characters")
    print(text[:1000])  # Preview the first 1,000 characters to inspect the extraction
    return text
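# A minimal sketch (not shown in the original) of how the loader and splitter
# imported above could feed get_text_from_document. The PDF path, chunk_size,
# and chunk_overlap are illustrative assumptions.
def load_and_chunk_pdf(pdf_path="example.pdf", chunk_size=1000, chunk_overlap=100):
    loader = UnstructuredPDFLoader(pdf_path)
    documents = loader.load()  # list of Document objects
    full_text = get_text_from_document(documents)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(full_text)  # list of chunk strings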
# Get the embedding of a text string using the OpenAI text-embedding-ada-002 model
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    # Read the key from the environment rather than hardcoding it in source
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    return client.embeddings.create(input=[text], model=model).data[0].embedding
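# A minimal sketch, assuming the index is populated like this (ingestion is not
# shown in the original): embed each chunk and upsert it with its text as
# metadata, which is what query_pinecone_vector_store below expects to read back.
# Dimension 1536 matches text-embedding-ada-002; the cloud/region values are
# illustrative assumptions.
def upsert_chunks_to_pinecone(chunks, index_name="ee596llm-project2"):
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    index = pc.Index(index_name)
    for i, chunk in enumerate(tqdm(chunks)):
        index.upsert(vectors=[{
            "id": f"chunk-{i}",
            "values": get_embedding(chunk),
            "metadata": {"text": chunk},
        }])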
# Query the Pinecone vector store and return the top-k results
def query_pinecone_vector_store(query, top_k=5):
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    # Read the key from the environment rather than hardcoding it in source
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    pinecone_index_name = "ee596llm-project2"
    index = pc.Index(pinecone_index_name)
    # Query the Pinecone index with the generated embedding
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    # Extract (id, score, text) tuples for the most relevant chunks
    relevant_docs = [
        (result['id'], result['score'], result['metadata']['text'])
        for result in query_results['matches']
    ]
    return relevant_docs
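# Example usage (a sketch; the query string and top_k are illustrative):
#
#   for doc_id, score, text in query_pinecone_vector_store("What is RAG?", top_k=3):
#       print(f"{doc_id} (score {score:.3f}): {text[:80]}")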
def get_completion(prompt, model="gpt-3.5-turbo"):
    message = {"role": "user", "content": prompt}
    # Read the key from the environment rather than hardcoding it in source
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model=model,
        messages=[message]
    )
    return response.choices[0].message.content
def generate_answer_with_context(query, results):
    # Build the prompt context from the top-k results; each result is an
    # (id, score, text) tuple, so the chunk text is at index 2
    context_texts = "\n\n".join(
        [f"Context {idx + 1}: {result[2]}" for idx, result in enumerate(results)])
    print(f"context_texts is: {context_texts}\n\n")
    prompt = (
        f"Given the following contexts related to the query '{query}', "
        f"provide a detailed answer:\n\n{context_texts}\n\nAnswer the query:"
    )
    # Generate the answer using the GPT-3.5 Turbo model with the constructed prompt
    answer = get_completion(prompt, model="gpt-3.5-turbo")
    return answer
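# Example usage (a sketch; the question is illustrative). The same query is used
# for both retrieval and answer generation:
#
#   question = "What topics does lecture 3 cover?"
#   results = query_pinecone_vector_store(question, top_k=4)
#   print(generate_answer_with_context(question, results))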
class Relevant_Documents_Agent:
    def __init__(self, openai_client) -> None:
        self.openai_client = openai_client

    def get_relevance(self, conversation) -> str:
        # Retrieve the top-k chunks most relevant to the conversation, then
        # generate an answer grounded in that retrieved context
        top_k_results = query_pinecone_vector_store(conversation, top_k=4)
        answer = generate_answer_with_context(conversation, top_k_results)
        return answer
    def compute_cosine_similarity(self, vec1, vec2):
        # Ensure the vectors are numpy arrays for the math operations
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        # Cosine similarity = dot product / product of the L2 norms
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity
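
# End-to-end usage (a minimal sketch; the question and example strings are
# illustrative). The agent only stores the OpenAI client; retrieval and
# generation run through the module-level helpers above.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    agent = Relevant_Documents_Agent(client)
    print(agent.get_relevance("Summarize the main topics covered in the documents."))
    # compute_cosine_similarity compares two embeddings directly, e.g.:
    # agent.compute_cosine_similarity(get_embedding("cat"), get_embedding("kitten"))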