Deeksha14 commited on
Commit
9f0715f
·
verified ·
1 Parent(s): f5e88eb

Upload 5 files

Browse files
Files changed (3) hide show
  1. gitattributes +35 -0
  2. requirements.txt +3 -1
  3. streamlit_app.py +28 -48
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt CHANGED
@@ -7,4 +7,6 @@ langchain-google-genai
7
  faiss-cpu
8
  PyPDF2
9
  python-docx
10
- beautifulsoup4
 
 
 
7
  faiss-cpu
8
  PyPDF2
9
  python-docx
10
+ beautifulsoup4
11
+ pinecone-client
12
+
streamlit_app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import streamlit as st
2
  from PyPDF2 import PdfReader
3
  from docx import Document
@@ -5,24 +9,32 @@ from bs4 import BeautifulSoup
5
  import os
6
  import google.generativeai as genai
7
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
8
- from langchain_community.vectorstores import FAISS
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain.chains.question_answering import load_qa_chain
11
  from langchain.prompts import PromptTemplate
12
  from dotenv import load_dotenv
 
13
 
14
  # ========================
15
  # 1️⃣ Configuration
16
  # ========================
17
- # Load environment variables and API key
18
  load_dotenv()
19
  api_key = os.getenv("GOOGLE_API_KEY")
20
- if not api_key:
21
- st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
 
 
 
22
  st.stop()
23
 
 
24
  genai.configure(api_key=api_key)
25
 
 
 
 
 
26
  # ========================
27
  # 2️⃣ File Size Limits
28
  # ========================
@@ -37,15 +49,13 @@ def validate_file_sizes(uploaded_files):
37
  st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
38
  return False
39
  total_size += size_mb
40
-
41
  if total_size > MAX_TOTAL_SIZE_MB:
42
  st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
43
  return False
44
-
45
  return True
46
 
47
  # ========================
48
- # 3️⃣ Text Extraction Functions
49
  # ========================
50
  def get_pdf_text(pdf_docs):
51
  text = ""
@@ -67,26 +77,18 @@ def get_html_text(html_file):
67
  return soup.get_text()
68
 
69
  # ========================
70
- # 4️⃣ Text Chunking and Vector Store
71
  # ========================
72
  def get_text_chunks(text):
73
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
74
  return text_splitter.split_text(text)
75
 
76
- def get_vector_store(text_chunks):
77
  embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
78
- vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
79
-
80
- # ✅ Save to Hugging Face's writable tmp directory
81
- save_path = "/tmp/faiss_index"
82
- vector_store.save_local(save_path)
83
-
84
- return vector_store
85
-
86
-
87
 
88
  # ========================
89
- # 5️⃣ Conversational Chain Setup
90
  # ========================
91
  def get_conversational_chain():
92
  prompt_template = """
@@ -106,48 +108,27 @@ def get_conversational_chain():
106
  return chain
107
 
108
  def user_input(user_question):
109
- save_path = "/tmp/faiss_index"
110
-
111
- if not os.path.exists(f"{save_path}/index.faiss"):
112
- st.error("Vector index not found. Please upload and process documents first.")
113
- return
114
-
115
  embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
116
- new_db = FAISS.load_local(save_path, embeddings, allow_dangerous_deserialization=True)
117
- docs = new_db.similarity_search(user_question)
118
-
119
  chain = get_conversational_chain()
120
  response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
121
  st.write("Reply:", response["output_text"])
122
 
123
-
124
  # ========================
125
- # 6️⃣ Streamlit App Layout
126
  # ========================
127
  def main():
128
  st.set_page_config(page_title="Chat with Documents")
129
- st.header("Chat with your PDF, DOCX, or HTML using Gemini 💬")
130
 
131
  user_question = st.text_input("Ask a question about your uploaded files:")
132
-
133
  if user_question:
134
- if os.path.exists("/tmp/faiss_index/index.faiss"):
135
- user_input(user_question)
136
- else:
137
- st.warning("Please upload and process documents before asking a question.")
138
-
139
 
140
  with st.sidebar:
141
  st.title("Upload & Process Files")
142
- uploaded_files = st.file_uploader(
143
- "Upload PDF, DOCX, or HTML files (Max 2MB per file, 5MB total)", # ✅ Custom message added here
144
- accept_multiple_files=True,
145
- type=['pdf', 'docx', 'html']
146
-
147
- )
148
-
149
-
150
-
151
 
152
  if st.button("Submit & Process"):
153
  if not uploaded_files:
@@ -168,9 +149,8 @@ def main():
168
  full_text += get_html_text(file)
169
  else:
170
  st.warning(f"Unsupported file type: {file.name}")
171
-
172
  text_chunks = get_text_chunks(full_text)
173
- get_vector_store(text_chunks)
174
  st.success("Processing complete!")
175
 
176
  if __name__ == "__main__":
 
1
+ # ========================
2
+ # 📄 streamlit_app.py
3
+ # Now using Pinecone instead of FAISS
4
+ # ========================
5
  import streamlit as st
6
  from PyPDF2 import PdfReader
7
  from docx import Document
 
9
  import os
10
  import google.generativeai as genai
11
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
12
+ from langchain.vectorstores import Pinecone
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
  from langchain.chains.question_answering import load_qa_chain
15
  from langchain.prompts import PromptTemplate
16
  from dotenv import load_dotenv
17
+ import pinecone
18
 
19
  # ========================
20
  # 1️⃣ Configuration
21
  # ========================
 
22
  load_dotenv()
23
  api_key = os.getenv("GOOGLE_API_KEY")
24
+ pinecone_api_key = os.getenv("PINECONE_API_KEY")
25
+ pinecone_env = os.getenv("PINECONE_ENV") # Example: "gcp-starter"
26
+
27
+ if not api_key or not pinecone_api_key:
28
+ st.error("Missing API key(s). Please check your .env settings.")
29
  st.stop()
30
 
31
+ # Init Gemini
32
  genai.configure(api_key=api_key)
33
 
34
+ # Init Pinecone
35
+ pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
36
+ index_name = "document-chat" # ✅ Must match what you created
37
+
38
  # ========================
39
  # 2️⃣ File Size Limits
40
  # ========================
 
49
  st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
50
  return False
51
  total_size += size_mb
 
52
  if total_size > MAX_TOTAL_SIZE_MB:
53
  st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
54
  return False
 
55
  return True
56
 
57
  # ========================
58
+ # 3️⃣ Text Extraction
59
  # ========================
60
  def get_pdf_text(pdf_docs):
61
  text = ""
 
77
  return soup.get_text()
78
 
79
  # ========================
80
+ # 4️⃣ Chunking + Pinecone
81
  # ========================
82
def get_text_chunks(text):
    """Split raw document text into overlapping chunks suitable for embedding.

    Uses a recursive character splitter (2000-char windows, 200-char overlap)
    so chunk boundaries keep enough shared context for retrieval.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    return chunks
85
 
86
def push_to_pinecone(chunks):
    """Embed text chunks with Gemini embeddings and upsert them into Pinecone.

    Args:
        chunks: list of text strings (output of get_text_chunks).

    Returns:
        The langchain Pinecone vectorstore wrapping the populated index.
        (Previously the store was discarded; returning it lets callers query
        immediately without a second `from_existing_index` round-trip.
        Existing callers that ignore the return value are unaffected.)
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # index_name is the module-level Pinecone index configured at startup.
    return Pinecone.from_texts(texts=chunks, embedding=embeddings, index_name=index_name)
 
 
 
 
 
 
 
 
89
 
90
  # ========================
91
+ # 5️⃣ Q&A Chain
92
  # ========================
93
  def get_conversational_chain():
94
  prompt_template = """
 
108
  return chain
109
 
110
def user_input(user_question):
    """Answer a question against the documents previously pushed to Pinecone.

    Retrieves the most relevant chunks from the Pinecone index, runs them
    through the Gemini QA chain, and writes the reply to the Streamlit page.

    Args:
        user_question: free-text question typed by the user.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    try:
        # The earlier FAISS version checked for a saved index on disk before
        # querying; after the Pinecone migration that guard was lost, so a
        # question asked before any upload crashed the app. Handle it here.
        vectorstore = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
        docs = vectorstore.similarity_search(user_question)
    except Exception as exc:
        st.error(f"Could not query Pinecone index '{index_name}'. "
                 f"Please upload and process documents first. ({exc})")
        return
    if not docs:
        st.warning("No relevant content found. Please upload and process documents first.")
        return
    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
    st.write("Reply:", response["output_text"])
117
 
 
118
  # ========================
119
+ # 6️⃣ Streamlit UI
120
  # ========================
121
  def main():
122
  st.set_page_config(page_title="Chat with Documents")
123
+ st.header("Chat with your PDF, DOCX, or HTML using Gemini + Pinecone")
124
 
125
  user_question = st.text_input("Ask a question about your uploaded files:")
 
126
  if user_question:
127
+ user_input(user_question)
 
 
 
 
128
 
129
  with st.sidebar:
130
  st.title("Upload & Process Files")
131
+ uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files (Max 2MB per file, 5MB total)", accept_multiple_files=True, type=['pdf', 'docx', 'html'])
 
 
 
 
 
 
 
 
132
 
133
  if st.button("Submit & Process"):
134
  if not uploaded_files:
 
149
  full_text += get_html_text(file)
150
  else:
151
  st.warning(f"Unsupported file type: {file.name}")
 
152
  text_chunks = get_text_chunks(full_text)
153
+ push_to_pinecone(text_chunks)
154
  st.success("Processing complete!")
155
 
156
  if __name__ == "__main__":