Batrdj committed on
Commit
6792445
Β·
verified Β·
1 Parent(s): eb7df7d

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +20 -0
  2. create.py +185 -0
  3. final.py +130 -0
  4. requirements.txt +18 -0
app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ PAGES = {
4
+ "Chat": "final.py",
5
+ "Admin": "admin.py"
6
+ }
7
+
8
+ def main():
9
+ selection = st.sidebar.radio("Go to", list(PAGES.keys()))
10
+ page = PAGES[selection]
11
+
12
+ if page == PAGES["Chat"]:
13
+ import final
14
+ final.main()
15
+ elif page == PAGES["Admin"]:
16
+ import admin
17
+ admin.main()
18
+
19
+ if __name__ == "__main__":
20
+ main()
create.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from pathlib import Path
3
+ # import cv2
4
+ # import pytesseract
5
+ # from PIL import Image
6
+ # from docx import Document
7
+ # from pptx import Presentation
8
+ # from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
9
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ # from langchain_huggingface import HuggingFaceEmbeddings
11
+ # from langchain_community.vectorstores import FAISS
12
+ # from langchain.schema import Document as LangchainDocument # βœ… Ensure correct Document format
13
+ # from dotenv import load_dotenv, find_dotenv
14
+
15
+ # # Load environment variables
16
+ # load_dotenv(find_dotenv())
17
+
18
+ # # Paths
19
+ # DATA_PATH = "data/"
20
+ # DB_FAISS_PATH = "vectorstore/db_faiss"
21
+
22
+ # # Set Tesseract OCR Path (update this based on your installation)
23
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"
24
+
25
+ # # Step 1: Load Documents from Multiple Sources
26
+ # def load_documents(data_path):
27
+ # documents = []
28
+
29
+ # # Load PDFs
30
+ # pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
31
+ # documents.extend(pdf_loader.load()) # PDFs are already in Document format
32
+
33
+ # # Load Word files
34
+ # for file in Path(data_path).glob("*.docx"):
35
+ # doc = Document(file)
36
+ # text = "\n".join([para.text for para in doc.paragraphs])
37
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
38
+
39
+ # # Load PowerPoint files
40
+ # for file in Path(data_path).glob("*.pptx"):
41
+ # prs = Presentation(file)
42
+ # text = ""
43
+ # for slide in prs.slides:
44
+ # for shape in slide.shapes:
45
+ # if hasattr(shape, "text"):
46
+ # text += shape.text + "\n"
47
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
48
+
49
+ # # Load Images (OCR)
50
+ # for image_file in Path(data_path).glob("*.jpg"):
51
+ # img = cv2.imread(str(image_file))
52
+ # text = pytesseract.image_to_string(img)
53
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
54
+
55
+ # for image_file in Path(data_path).glob("*.png"):
56
+ # img = cv2.imread(str(image_file))
57
+ # text = pytesseract.image_to_string(img)
58
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
59
+
60
+ # print(f"βœ… Loaded {len(documents)} documents from {data_path}")
61
+ # return documents
62
+
63
+ # # Step 2: Create Chunks
64
+ # def create_chunks(documents):
65
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
66
+ # text_chunks = text_splitter.split_documents(documents)
67
+ # print(f"βœ… Created {len(text_chunks)} text chunks")
68
+ # return text_chunks
69
+
70
+ # # Step 3: Create Vector Embeddings
71
+ # def get_embedding_model():
72
+ # return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
73
+
74
+ # # Step 4: Store embeddings in FAISS
75
+ # def create_vector_store(text_chunks):
76
+ # embedding_model = get_embedding_model()
77
+ # print("πŸ”„ Creating vector store...")
78
+ # db = FAISS.from_documents(text_chunks, embedding_model)
79
+ # db.save_local(DB_FAISS_PATH)
80
+ # print("βœ… Vector store created/updated successfully.")
81
+
82
+ # # Step 5: Main Execution
83
+ # if __name__ == "__main__":
84
+ # print("πŸš€ Starting process...")
85
+ # documents = load_documents(DATA_PATH)
86
+ # text_chunks = create_chunks(documents)
87
+ # create_vector_store(text_chunks)
88
+ # print("πŸŽ‰ Process completed successfully!")
89
+
90
+
91
+ import os
92
+ from pathlib import Path
93
+ import cv2
94
+ import pytesseract
95
+ from PIL import Image
96
+ from docx import Document
97
+ from pptx import Presentation
98
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
99
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
100
+ from langchain_huggingface import HuggingFaceEmbeddings
101
+ from langchain_community.vectorstores import FAISS
102
+ from langchain.schema import Document as LangchainDocument
103
+ from dotenv import load_dotenv, find_dotenv
104
+
105
+ # Load environment variables
106
+ load_dotenv(find_dotenv())
107
+
108
+ # Paths
109
+ DATA_PATH = "data/"
110
+ DB_FAISS_PATH = "vectorstore/db_faiss"
111
+
112
+ # Set Tesseract OCR Path (update this based on your installation)
113
+ pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"
114
+
115
+ # Function to extract text from images
116
+ def extract_text_from_image(image_path):
117
+ img = cv2.imread(str(image_path))
118
+ if img is None:
119
+ print(f"⚠️ Warning: Unable to read image {image_path}")
120
+ return ""
121
+ text = pytesseract.image_to_string(img)
122
+ return text.strip()
123
+
124
+ # Step 1: Load Documents from Multiple Sources
125
+ def load_documents(data_path):
126
+ documents = []
127
+
128
+ # Load PDFs
129
+ pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
130
+ documents.extend(pdf_loader.load())
131
+
132
+ # Load Word files
133
+ for file in Path(data_path).glob("*.docx"):
134
+ doc = Document(file)
135
+ text = "\n".join([para.text for para in doc.paragraphs])
136
+ documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
137
+
138
+ # Load PowerPoint files
139
+ for file in Path(data_path).glob("*.pptx"):
140
+ prs = Presentation(file)
141
+ for i, slide in enumerate(prs.slides):
142
+ text = "\n".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
143
+ if text.strip():
144
+ documents.append(LangchainDocument(page_content=text, metadata={"source": file.name, "slide": i + 1}))
145
+
146
+ # Load Images (OCR) - JPG and PNG
147
+ for image_file in Path(data_path).rglob("*.jpg"):
148
+ text = extract_text_from_image(image_file)
149
+ if text:
150
+ documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
151
+
152
+ for image_file in Path(data_path).rglob("*.png"):
153
+ text = extract_text_from_image(image_file)
154
+ if text:
155
+ documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
156
+
157
+ print(f"βœ… Loaded {len(documents)} documents from {data_path}")
158
+ return documents
159
+
160
+ # Step 2: Create Chunks
161
+ def create_chunks(documents):
162
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
163
+ text_chunks = text_splitter.split_documents(documents)
164
+ print(f"βœ… Created {len(text_chunks)} text chunks")
165
+ return text_chunks
166
+
167
+ # Step 3: Create Vector Embeddings
168
+ def get_embedding_model():
169
+ return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
170
+
171
+ # Step 4: Store embeddings in FAISS
172
+ def create_vector_store(text_chunks):
173
+ embedding_model = get_embedding_model()
174
+ print("πŸ”„ Creating vector store...")
175
+ db = FAISS.from_documents(text_chunks, embedding_model)
176
+ db.save_local(DB_FAISS_PATH)
177
+ print("βœ… Vector store created/updated successfully.")
178
+
179
+ # Step 5: Main Execution
180
+ if __name__ == "__main__":
181
+ print("πŸš€ Starting process...")
182
+ documents = load_documents(DATA_PATH)
183
+ text_chunks = create_chunks(documents)
184
+ create_vector_store(text_chunks)
185
+ print("πŸŽ‰ Process completed successfully!")
final.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.prompts import PromptTemplate
7
+ from langchain_huggingface import HuggingFaceEndpoint
8
+ from dotenv import load_dotenv, find_dotenv
9
+
10
+ # βœ… Load environment variables
11
+ load_dotenv(find_dotenv())
12
+
13
+ # βœ… FAISS Database Path
14
+ DB_FAISS_PATH = "vectorstore/db_faiss"
15
+
16
+ @st.cache_resource
17
+ def get_vectorstore():
18
+ """Loads the FAISS vector store with embeddings."""
19
+ try:
20
+ embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
21
+ return FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
22
+ except Exception as e:
23
+ st.error(f"⚠️ Error loading vector store: {str(e)}")
24
+ return None
25
+
26
+ @st.cache_resource
27
+ def load_llm():
28
+ """Loads the Hugging Face LLM model for text generation."""
29
+ HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
30
+ HF_TOKEN = os.getenv("HF_TOKEN")
31
+
32
+ if not HF_TOKEN:
33
+ st.error("⚠️ Hugging Face API token is missing. Please check your environment variables.")
34
+ return None
35
+
36
+ try:
37
+ return HuggingFaceEndpoint(
38
+ repo_id=HUGGINGFACE_REPO_ID,
39
+ task="text-generation",
40
+ temperature=0.3,
41
+ model_kwargs={"token": HF_TOKEN, "max_length": 256}
42
+ )
43
+ except Exception as e:
44
+ st.error(f"⚠️ Error loading LLM: {str(e)}")
45
+ return None
46
+
47
+ def set_custom_prompt():
48
+ """Defines the chatbot's behavior with a custom prompt template."""
49
+ return PromptTemplate(
50
+ template="""
51
+ You are an SEO chatbot with advanced knowledge. Answer based **strictly** on the provided documents.
52
+
53
+ If the answer is in the context, provide a **clear, professional, and concise** response with sources.
54
+ If the question is **outside the given context**, politely decline:
55
+
56
+ **"I'm sorry, but I can only provide answers based on the available documents."**
57
+
58
+ **Context:** {context}
59
+ **Question:** {question}
60
+
61
+ **Answer:**
62
+ """,
63
+ input_variables=["context", "question"]
64
+ )
65
+
66
+ def generate_response(prompt, vectorstore, llm):
67
+ """Retrieves relevant documents and generates a response from the LLM."""
68
+ if not vectorstore or not llm:
69
+ return "❌ Unable to process your request due to initialization issues."
70
+
71
+ try:
72
+ qa_chain = RetrievalQA.from_chain_type(
73
+ llm=llm,
74
+ chain_type="stuff",
75
+ retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
76
+ return_source_documents=True,
77
+ chain_type_kwargs={'prompt': set_custom_prompt()}
78
+ )
79
+
80
+ response_data = qa_chain.invoke({'query': prompt})
81
+ result = response_data.get("result", "")
82
+ source_documents = response_data.get("source_documents", [])
83
+
84
+ if not result or not source_documents:
85
+ return "❌ Sorry, but I can only provide answers based on the available documents."
86
+
87
+ formatted_sources = "\n\nπŸ“š **Sources:**" + "".join(
88
+ [f"\n- {doc.metadata.get('source', 'Unknown')} (Page: {doc.metadata.get('page', 'N/A')})" for doc in source_documents]
89
+ )
90
+ return f"{result}{formatted_sources}"
91
+
92
+ except Exception as e:
93
+ return f"⚠️ **Error:** {str(e)}"
94
+
95
+ def main():
96
+ """Runs the Streamlit chatbot application."""
97
+ st.title("🧠 Brainmines SEO Chatbot - Your AI Assistant for SEO Queries πŸš€")
98
+
99
+ # βœ… Load vector store and LLM
100
+ vectorstore = get_vectorstore()
101
+ llm = load_llm()
102
+
103
+ if not vectorstore or not llm:
104
+ st.error("⚠️ Failed to initialize vector store or LLM. Please check configurations.")
105
+ return
106
+
107
+ # βœ… Initialize session state
108
+ if "messages" not in st.session_state:
109
+ st.session_state.messages = [
110
+ {"role": "assistant", "content": "Hello! πŸ‘‹ I'm here to assist you with SEO-related queries. πŸš€"},
111
+ ]
112
+
113
+ # βœ… Display chat history
114
+ for message in st.session_state.messages:
115
+ st.chat_message(message["role"]).markdown(message["content"])
116
+
117
+ prompt = st.chat_input("πŸ’¬ Enter your SEO question here")
118
+
119
+ if prompt:
120
+ st.chat_message("user").markdown(prompt)
121
+ st.session_state.messages.append({"role": "user", "content": prompt})
122
+
123
+ with st.spinner("Thinking... πŸ€”"):
124
+ response = generate_response(prompt, vectorstore, llm)
125
+
126
+ st.chat_message("assistant").markdown(response)
127
+ st.session_state.messages.append({"role": "assistant", "content": response})
128
+
129
+ if __name__ == "__main__":
130
+ main()
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain-community
4
+ langchain-huggingface
5
+ python-dotenv
6
+ faiss-cpu
7
+ pytesseract
8
+ pillow
9
+ opencv-python-headless
10
+ python-docx
11
+ python-pptx
12
+ pandas
13
+ numpy
14
+ huggingface_hub
15
+ requests
16
+ transformers
17
+ sentence-transformers
18
+ torch