Jagukumar committed on
Commit
74a5040
·
verified ·
1 Parent(s): 2a212a4

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +88 -0
  2. pin.py +170 -0
  3. processing.py +119 -0
  4. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from processing import extract_text, preprocess_text_generalized
2
+ from pin import initialize_pinecone, handle_file_upload, query_pinecone, get_openai_embeddings
3
+ import gradio as gr
4
+ from dotenv import load_dotenv
5
+ import os
6
+ import openai
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # OpenAI and Pinecone settings
12
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
13
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
14
+ INDEX_NAME = "document-embeddings"
15
+ EMBEDDING_DIMENSION = 1536 # OpenAI embeddings dimension for `text-embedding-ada-002`
16
+ CLOUD = "aws"
17
+ REGION = "us-east-1"
18
+
19
+ # Set OpenAI API key
20
+ openai.api_key = OPENAI_API_KEY
21
+
22
def generate_response(user_query, pinecone_index, namespace="default", model="gpt-3.5-turbo"):
    """
    Generate a response to the user's query using OpenAI GPT and Pinecone for context retrieval.

    Args:
        user_query: Question text typed by the user.
        pinecone_index: Index object handed through to `query_pinecone`.
        namespace: Pinecone namespace to search.
        model: Chat model name passed to `openai.ChatCompletion.create`.

    Returns:
        The model's answer text, or a human-readable error string when the
        embedding, retrieval, or generation step fails.
    """
    # Step 1: Generate query embedding
    query_embeddings = get_openai_embeddings(user_query)

    # The helper returns None on API errors and an empty list for empty input;
    # neither can be used as a query vector.
    if not query_embeddings:
        return "Error generating query embedding. Please try again."

    # The helper returns one vector per text chunk, while a similarity query
    # takes a single flat vector, so search with the first chunk's vector.
    # A flat vector (first element is a number) is passed through unchanged.
    first_item = query_embeddings[0]
    query_vector = first_item if isinstance(first_item, (list, tuple)) else query_embeddings

    # Step 2: Retrieve context from Pinecone
    try:
        matches = query_pinecone(pinecone_index, query_vector, namespace=namespace, top_k=5)
    except Exception as e:
        return f"Error retrieving context: {e}"

    # Several matches may carry the same metadata text; keep each distinct
    # text once (in match order) so the prompt is not padded with duplicates.
    texts = [match["metadata"].get("text", "") for match in matches]
    context = " ".join(dict.fromkeys(texts))

    # Step 3: Create prompt
    if context.strip():
        prompt = f"Context: {context}\n\nQuestion: {user_query}\n\nAnswer:"
    else:
        # No relevant context found, use a general-purpose prompt
        prompt = f"Question: {user_query}\n\nAnswer:"

    # Step 4: Generate response using OpenAI GPT
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant capable of answering general questions and questions based on provided context."},
                {"role": "user", "content": prompt},
            ],
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error generating response: {e}"
55
+
56
+
57
+ # Gradio UI for chatbot
58
def handle_user_query(file, user_query):
    """
    Handles the entire pipeline: dynamically process new file uploads,
    update embeddings in Pinecone, and generate responses for user queries.

    Args:
        file: Uploaded file object from the UI (its `.name` is used as the
            path), or a falsy value when nothing was uploaded.
        user_query: Question text from the UI; may be empty.

    Returns:
        The chatbot's answer, or a short instruction string when no question
        was entered.
    """
    query_text = (user_query or "").strip()

    # Nothing to do at all: answer immediately without touching Pinecone.
    if not file and not query_text:
        return "Please enter a question."

    namespace = "user_session"
    pinecone_index = initialize_pinecone(
        api_key=PINECONE_API_KEY,
        index_name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        cloud=CLOUD,
        region=REGION,
    )

    # Process the uploaded file dynamically
    if file:
        handle_file_upload(file.name, pinecone_index, namespace=namespace)

    # A blank query would produce no embedding to search with, so stop here
    # after the upload has been processed instead of querying with nothing.
    if not query_text:
        return "Document processed. Please enter a question."

    # Generate response for the user's query
    return generate_response(user_query, pinecone_index, namespace=namespace)
78
+
79
# Build the single-page interface: a document upload, a question box, a
# read-only answer box, and a button that runs the full pipeline.
with gr.Blocks() as ui:
    gr.Markdown("# Dynamic Chatbot with Retrieval-Augmented Generation (RAG)")
    file_input = gr.File(
        label="Upload Document",
        file_types=[".pdf", ".csv", ".json"],
    )
    user_query = gr.Textbox(
        label="Your Query",
        placeholder="Ask a question...",
    )
    chatbot_response = gr.Textbox(
        label="Chatbot Response",
        interactive=False,
    )
    submit_button = gr.Button("Submit")
    submit_button.click(
        handle_user_query,
        inputs=[file_input, user_query],
        outputs=chatbot_response,
    )

# Start the web app only when this module is executed as a script.
if __name__ == "__main__":
    ui.launch()
pin.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from dotenv import load_dotenv
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ import openai
6
+ import hashlib
7
+ from processing import extract_text, preprocess_text_generalized
8
+
9
+ # Load environment variables from .env file
10
+ load_dotenv()
11
+
12
+ # Get Pinecone and OpenAI API keys from .env
13
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
14
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
15
+ INDEX_NAME = "document-embeddings"
16
+ EMBEDDING_DIMENSION = 1536 # OpenAI's embeddings dimension for `text-embedding-ada-002`
17
+ CLOUD = "aws"
18
+ REGION = "us-east-1"
19
+
20
+ # Set OpenAI API key
21
+ openai.api_key = OPENAI_API_KEY
22
+
23
+
24
+ # Initialize Pinecone
25
def initialize_pinecone(api_key, index_name, dimension, cloud="aws", region="us-east-1"):
    """
    Connect to Pinecone and return a handle to `index_name`, creating the
    index first (cosine metric, serverless spec) when it is missing.
    """
    client = Pinecone(api_key=api_key)

    # Create the index on first use
    existing_names = client.list_indexes().names()
    if index_name not in existing_names:
        print(f"Index '{index_name}' does not exist. Creating a new index...")
        serverless_spec = ServerlessSpec(cloud=cloud, region=region)
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=serverless_spec,
        )

    # Poll once per second until Pinecone reports the index as ready
    while True:
        if client.describe_index(index_name).status["ready"]:
            break
        print("Waiting for index to be ready...")
        time.sleep(1)

    return client.Index(index_name)
49
+
50
+
51
+ # Save embeddings to Pinecone vector DB
52
+ from pinecone.core.openapi.shared.exceptions import NotFoundException
53
+
54
+
55
def save_embeddings_to_pinecone(index, embeddings, metadata, namespace="default", batch_size=100):
    """
    Save embeddings to Pinecone. Clears old embeddings if they exist.

    Args:
        index: Pinecone index object (uses `describe_index_stats`, `delete`
            and `upsert`).
        embeddings: Sequence of embedding vectors; vector i is stored under
            the id ``doc_{i}``.
        metadata: Metadata dict attached to every stored vector.
        namespace: Namespace that is cleared and then written to.
        batch_size: Maximum number of vectors sent per upsert request, so a
            large document is not pushed as one oversized request.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # Validate before anything is deleted so a bad argument cannot wipe data.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    try:
        # Check if the namespace exists before attempting deletion
        index_description = index.describe_index_stats()
        if namespace in index_description.get("namespaces", {}):
            index.delete(delete_all=True, namespace=namespace)
            print(f"Cleared all previous embeddings in namespace: {namespace}")
        else:
            print(f"Namespace '{namespace}' not found. Proceeding to save new embeddings.")
    except Exception as e:
        # Best effort: a failed cleanup is reported but does not block saving.
        print(f"Error while checking/deleting embeddings in namespace {namespace}: {e}")

    if not embeddings:
        print("No embeddings to save. Skipping upsert operation.")
        return

    vectors = [
        {"id": f"doc_{i}", "values": embedding, "metadata": metadata}
        for i, embedding in enumerate(embeddings)
    ]
    # Send the vectors in bounded batches instead of a single request.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size], namespace=namespace)
    print(f"Saved embeddings to namespace: {namespace}")
79
+
80
+
81
+
82
+ # Generate embeddings using OpenAI API
83
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """
    Embed `text` with OpenAI's embedding endpoint, one request per slice.

    The text is cut into consecutive slices of at most 8192 characters (the
    limit is applied to characters, not tokens) and each slice is embedded
    separately. Returns a list with the embedding vectors in slice order, an
    empty list for empty text, or None if anything raises along the way.
    """
    slice_length = 8192  # Adjust based on the model's maximum token limit
    vectors = []
    try:
        for start in range(0, len(text), slice_length):
            piece = text[start:start + slice_length]
            reply = openai.Embedding.create(input=piece, model=model)
            for item in reply["data"]:
                vectors.append(item["embedding"])
    except Exception as e:
        print(f"Error generating embeddings with OpenAI API: {e}")
        return None
    return vectors
100
+
101
+ # Query Pinecone for relevant embeddings
102
def query_pinecone(index, query_embedding, namespace="default", top_k=3):
    """
    Run a similarity search on `index` and return its matches.

    Metadata is always requested so callers can read the stored fields of
    each of the `top_k` matches.
    """
    search_args = {
        "vector": query_embedding,
        "namespace": namespace,
        "top_k": top_k,
        "include_metadata": True,
    }
    response = index.query(**search_args)
    return response["matches"]
113
+
114
+
115
+ # Pipeline for handling file uploads and updating Pinecone vector DB
116
+ # Global variable to track the previous file hash
117
+ previous_file_hash = None
118
+
119
def calculate_file_hash(file_path):
    """
    Return the hexadecimal MD5 digest of the file at `file_path`.

    The file is read in 8192-byte blocks so it never has to fit in memory.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()
128
+
129
def handle_file_upload(file_path, pinecone_index, namespace="default"):
    """
    Ingest an uploaded file: extract and preprocess its text, embed it, and
    replace the namespace's stored embeddings with the new ones.

    A file whose hash equals the last successfully ingested file is skipped.
    Any error raised during processing is printed rather than propagated.
    """
    global previous_file_hash

    file_hash = calculate_file_hash(file_path)
    if file_hash == previous_file_hash:
        print(f"File '{file_path}' is identical to the previously uploaded file. Skipping processing.")
        return

    try:
        cleaned_text = preprocess_text_generalized(extract_text(file_path))

        # Generate embeddings
        vectors = get_openai_embeddings(cleaned_text)
        if not vectors:
            print("Failed to generate embeddings. Skipping save operation.")
        else:
            # NOTE(review): the same metadata, including the full processed
            # text, is attached to every vector - confirm this is intended.
            payload = {"file_name": os.path.basename(file_path), "text": cleaned_text}
            save_embeddings_to_pinecone(pinecone_index, vectors, payload, namespace)
            # Remember the hash only after a successful save
            previous_file_hash = file_hash
    except Exception as e:
        print(f"Error processing file upload: {e}")
155
+
156
+
157
+
158
+
159
+ # Example usage
160
if __name__ == "__main__":
    # Running the module directly just connects to Pinecone, creating the
    # serverless index when it does not exist yet.
    pinecone_index = initialize_pinecone(
        PINECONE_API_KEY,
        INDEX_NAME,
        EMBEDDING_DIMENSION,
        cloud=CLOUD,
        region=REGION,
    )
169
+
170
+
processing.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mimetypes
2
+ import pandas as pd
3
+ import PyPDF2
4
+ import json
5
+ import re
6
+ import spacy
7
+ import os
8
+ from dotenv import load_dotenv
9
+ import openai
10
+ import numpy as np
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ # Set OpenAI API Key
16
+ openai.api_key = os.getenv("OPENAI_API_KEY")
17
+
18
+ # Load SpaCy model
19
+ nlp = spacy.load("en_core_web_sm")
20
+
21
+ # Detect file type
22
def detect_file_type(file_path):
    """
    Map a path to "pdf", "csv" or "json" based on its guessed MIME type.

    Raises:
        ValueError: If the guessed MIME type is not one of the supported ones.
    """
    mime_type, _ = mimetypes.guess_type(file_path)
    kinds_by_mime = {
        "application/pdf": "pdf",
        "text/csv": "csv",
        "application/vnd.ms-excel": "csv",
        "application/json": "json",
    }
    if mime_type not in kinds_by_mime:
        raise ValueError(f"Unsupported file format: {mime_type}")
    return kinds_by_mime[mime_type]
32
+
33
+ # Extract text from CSV
34
def extract_text_from_csv(file_path):
    """
    Read a CSV file and return all of its cells, converted to strings and
    joined with single spaces.
    """
    frame = pd.read_csv(file_path)
    cells = frame.astype(str).stack()
    return " ".join(cells)
38
+
39
+ # Extract text from PDF
40
def extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by newlines.
    """
    pdf_reader = PyPDF2.PdfReader(file_path)
    # A page that yields no text is treated as an empty string so the join
    # below cannot fail on it.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Separate pages explicitly; concatenating them directly fuses the last
    # word of one page with the first word of the next.
    return "\n".join(page_texts)
46
+
47
+ # Extract text from JSON
48
def extract_text_from_json(file_path):
    """
    Flatten all values of a JSON document into one space-separated string.

    Dictionaries contribute their values (keys are ignored), lists contribute
    their items, and every other value is converted with `str`.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The flattened text.
    """
    def recursive_text_extraction(data):
        if isinstance(data, dict):
            return " ".join(recursive_text_extraction(value) for value in data.values())
        elif isinstance(data, list):
            return " ".join(recursive_text_extraction(item) for item in data)
        else:
            return str(data)

    # Decode explicitly instead of relying on the platform default encoding;
    # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark,
    # which would otherwise make json.load fail.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)
    return recursive_text_extraction(data)
60
+
61
+ # Generalized text extraction
62
def extract_text(file_path):
    """
    Detect the file's type and return its text using the matching extractor.

    Raises:
        ValueError: If the detected type has no extractor.
    """
    file_type = detect_file_type(file_path)
    extractors = {
        "csv": extract_text_from_csv,
        "pdf": extract_text_from_pdf,
        "json": extract_text_from_json,
    }
    extractor = extractors.get(file_type)
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)
72
+
73
+ # Preprocess text
74
def preprocess_text_generalized(text):
    """
    Clean raw text and reduce it to lowercase lemmas.

    URLs are removed, whitespace runs are collapsed to single spaces, and
    characters outside printable ASCII are dropped. The result is processed
    with the module's spaCy pipeline in slices of up to 100000 characters,
    keeping the lemma of every alphabetic token that is not a stop word.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    # Normalize whitespace BEFORE stripping non-printable characters: newlines
    # and tabs lie outside \x20-\x7E, so removing them first would glue
    # together words that were separated only by a line break or tab.
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-ASCII characters
    chunk_size = 100000  # Maximum chunk size
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    processed_chunks = []
    for chunk in chunks:
        doc = nlp(chunk.lower())
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_chunks.append(" ".join(tokens))
    return " ".join(processed_chunks)
91
+
92
+ # Generate embeddings using OpenAI API
93
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """
    Request an embedding for `text` from the OpenAI API.

    Returns the first embedding of the response as a NumPy array, or None
    (after printing the error) if the request or conversion raises.
    """
    try:
        reply = openai.Embedding.create(input=text, model=model)
        # Convert to NumPy array for compatibility
        vector = np.array(reply["data"][0]["embedding"])
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return None
    return vector
104
+
105
+ # Example usage
106
if __name__ == "__main__":
    # Demo run against a sample document: extract, clean, then embed.
    file_path = "example.pdf"

    raw_text = extract_text(file_path)
    preprocessed_text = preprocess_text_generalized(raw_text)

    embeddings = get_openai_embeddings(preprocessed_text)
    if embeddings is None:
        print("Failed to generate embeddings.")
    else:
        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ pandas
4
+ PyPDF2
5
+ ipykernel
6
+ spacy
7
+ torch
8
+ pinecone
9
+ python-dotenv
10
+ json5
11
+ accelerate==0.26.0
12
+ openai==0.28