Batrdj committed on
Commit
6792445
Β·
verified Β·
1 Parent(s): eb7df7d

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +20 -0
  2. create.py +185 -0
  3. final.py +130 -0
  4. requirements.txt +18 -0
app.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ PAGES = {
4
+ "Chat": "final.py",
5
+ "Admin": "admin.py"
6
+ }
7
+
8
+ def main():
9
+ selection = st.sidebar.radio("Go to", list(PAGES.keys()))
10
+ page = PAGES[selection]
11
+
12
+ if page == PAGES["Chat"]:
13
+ import final
14
+ final.main()
15
+ elif page == PAGES["Admin"]:
16
+ import admin
17
+ admin.main()
18
+
19
+ if __name__ == "__main__":
20
+ main()
create.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from pathlib import Path
3
+ # import cv2
4
+ # import pytesseract
5
+ # from PIL import Image
6
+ # from docx import Document
7
+ # from pptx import Presentation
8
+ # from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
9
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ # from langchain_huggingface import HuggingFaceEmbeddings
11
+ # from langchain_community.vectorstores import FAISS
12
+ # from langchain.schema import Document as LangchainDocument # βœ… Ensure correct Document format
13
+ # from dotenv import load_dotenv, find_dotenv
14
+
15
+ # # Load environment variables
16
+ # load_dotenv(find_dotenv())
17
+
18
+ # # Paths
19
+ # DATA_PATH = "data/"
20
+ # DB_FAISS_PATH = "vectorstore/db_faiss"
21
+
22
+ # # Set Tesseract OCR Path (update this based on your installation)
23
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"
24
+
25
+ # # Step 1: Load Documents from Multiple Sources
26
+ # def load_documents(data_path):
27
+ # documents = []
28
+
29
+ # # Load PDFs
30
+ # pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
31
+ # documents.extend(pdf_loader.load()) # PDFs are already in Document format
32
+
33
+ # # Load Word files
34
+ # for file in Path(data_path).glob("*.docx"):
35
+ # doc = Document(file)
36
+ # text = "\n".join([para.text for para in doc.paragraphs])
37
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
38
+
39
+ # # Load PowerPoint files
40
+ # for file in Path(data_path).glob("*.pptx"):
41
+ # prs = Presentation(file)
42
+ # text = ""
43
+ # for slide in prs.slides:
44
+ # for shape in slide.shapes:
45
+ # if hasattr(shape, "text"):
46
+ # text += shape.text + "\n"
47
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
48
+
49
+ # # Load Images (OCR)
50
+ # for image_file in Path(data_path).glob("*.jpg"):
51
+ # img = cv2.imread(str(image_file))
52
+ # text = pytesseract.image_to_string(img)
53
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
54
+
55
+ # for image_file in Path(data_path).glob("*.png"):
56
+ # img = cv2.imread(str(image_file))
57
+ # text = pytesseract.image_to_string(img)
58
+ # documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
59
+
60
+ # print(f"βœ… Loaded {len(documents)} documents from {data_path}")
61
+ # return documents
62
+
63
+ # # Step 2: Create Chunks
64
+ # def create_chunks(documents):
65
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
66
+ # text_chunks = text_splitter.split_documents(documents)
67
+ # print(f"βœ… Created {len(text_chunks)} text chunks")
68
+ # return text_chunks
69
+
70
+ # # Step 3: Create Vector Embeddings
71
+ # def get_embedding_model():
72
+ # return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
73
+
74
+ # # Step 4: Store embeddings in FAISS
75
+ # def create_vector_store(text_chunks):
76
+ # embedding_model = get_embedding_model()
77
+ # print("πŸ”„ Creating vector store...")
78
+ # db = FAISS.from_documents(text_chunks, embedding_model)
79
+ # db.save_local(DB_FAISS_PATH)
80
+ # print("βœ… Vector store created/updated successfully.")
81
+
82
+ # # Step 5: Main Execution
83
+ # if __name__ == "__main__":
84
+ # print("πŸš€ Starting process...")
85
+ # documents = load_documents(DATA_PATH)
86
+ # text_chunks = create_chunks(documents)
87
+ # create_vector_store(text_chunks)
88
+ # print("πŸŽ‰ Process completed successfully!")
89
+
90
+
91
+ import os
92
+ from pathlib import Path
93
+ import cv2
94
+ import pytesseract
95
+ from PIL import Image
96
+ from docx import Document
97
+ from pptx import Presentation
98
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
99
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
100
+ from langchain_huggingface import HuggingFaceEmbeddings
101
+ from langchain_community.vectorstores import FAISS
102
+ from langchain.schema import Document as LangchainDocument
103
+ from dotenv import load_dotenv, find_dotenv
104
+
105
+ # Load environment variables
106
+ load_dotenv(find_dotenv())
107
+
108
+ # Paths
109
+ DATA_PATH = "data/"
110
+ DB_FAISS_PATH = "vectorstore/db_faiss"
111
+
112
+ # Set Tesseract OCR Path (update this based on your installation)
113
+ pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"
114
+
115
+ # Function to extract text from images
116
+ def extract_text_from_image(image_path):
117
+ img = cv2.imread(str(image_path))
118
+ if img is None:
119
+ print(f"⚠️ Warning: Unable to read image {image_path}")
120
+ return ""
121
+ text = pytesseract.image_to_string(img)
122
+ return text.strip()
123
+
124
+ # Step 1: Load Documents from Multiple Sources
125
+ def load_documents(data_path):
126
+ documents = []
127
+
128
+ # Load PDFs
129
+ pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
130
+ documents.extend(pdf_loader.load())
131
+
132
+ # Load Word files
133
+ for file in Path(data_path).glob("*.docx"):
134
+ doc = Document(file)
135
+ text = "\n".join([para.text for para in doc.paragraphs])
136
+ documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
137
+
138
+ # Load PowerPoint files
139
+ for file in Path(data_path).glob("*.pptx"):
140
+ prs = Presentation(file)
141
+ for i, slide in enumerate(prs.slides):
142
+ text = "\n".join([shape.text for shape in slide.shapes if hasattr(shape, "text")])
143
+ if text.strip():
144
+ documents.append(LangchainDocument(page_content=text, metadata={"source": file.name, "slide": i + 1}))
145
+
146
+ # Load Images (OCR) - JPG and PNG
147
+ for image_file in Path(data_path).rglob("*.jpg"):
148
+ text = extract_text_from_image(image_file)
149
+ if text:
150
+ documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
151
+
152
+ for image_file in Path(data_path).rglob("*.png"):
153
+ text = extract_text_from_image(image_file)
154
+ if text:
155
+ documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
156
+
157
+ print(f"βœ… Loaded {len(documents)} documents from {data_path}")
158
+ return documents
159
+
160
+ # Step 2: Create Chunks
161
+ def create_chunks(documents):
162
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
163
+ text_chunks = text_splitter.split_documents(documents)
164
+ print(f"βœ… Created {len(text_chunks)} text chunks")
165
+ return text_chunks
166
+
167
+ # Step 3: Create Vector Embeddings
168
+ def get_embedding_model():
169
+ return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
170
+
171
+ # Step 4: Store embeddings in FAISS
172
+ def create_vector_store(text_chunks):
173
+ embedding_model = get_embedding_model()
174
+ print("πŸ”„ Creating vector store...")
175
+ db = FAISS.from_documents(text_chunks, embedding_model)
176
+ db.save_local(DB_FAISS_PATH)
177
+ print("βœ… Vector store created/updated successfully.")
178
+
179
+ # Step 5: Main Execution
180
+ if __name__ == "__main__":
181
+ print("πŸš€ Starting process...")
182
+ documents = load_documents(DATA_PATH)
183
+ text_chunks = create_chunks(documents)
184
+ create_vector_store(text_chunks)
185
+ print("πŸŽ‰ Process completed successfully!")
final.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.chains import RetrievalQA
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.prompts import PromptTemplate
7
+ from langchain_huggingface import HuggingFaceEndpoint
8
+ from dotenv import load_dotenv, find_dotenv
9
+
10
+ # βœ… Load environment variables
11
+ load_dotenv(find_dotenv())
12
+
13
+ # βœ… FAISS Database Path
14
+ DB_FAISS_PATH = "vectorstore/db_faiss"
15
+
16
+ @st.cache_resource
17
+ def get_vectorstore():
18
+ """Loads the FAISS vector store with embeddings."""
19
+ try:
20
+ embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
21
+ return FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
22
+ except Exception as e:
23
+ st.error(f"⚠️ Error loading vector store: {str(e)}")
24
+ return None
25
+
26
+ @st.cache_resource
27
+ def load_llm():
28
+ """Loads the Hugging Face LLM model for text generation."""
29
+ HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
30
+ HF_TOKEN = os.getenv("HF_TOKEN")
31
+
32
+ if not HF_TOKEN:
33
+ st.error("⚠️ Hugging Face API token is missing. Please check your environment variables.")
34
+ return None
35
+
36
+ try:
37
+ return HuggingFaceEndpoint(
38
+ repo_id=HUGGINGFACE_REPO_ID,
39
+ task="text-generation",
40
+ temperature=0.3,
41
+ model_kwargs={"token": HF_TOKEN, "max_length": 256}
42
+ )
43
+ except Exception as e:
44
+ st.error(f"⚠️ Error loading LLM: {str(e)}")
45
+ return None
46
+
47
+ def set_custom_prompt():
48
+ """Defines the chatbot's behavior with a custom prompt template."""
49
+ return PromptTemplate(
50
+ template="""
51
+ You are an SEO chatbot with advanced knowledge. Answer based **strictly** on the provided documents.
52
+
53
+ If the answer is in the context, provide a **clear, professional, and concise** response with sources.
54
+ If the question is **outside the given context**, politely decline:
55
+
56
+ **"I'm sorry, but I can only provide answers based on the available documents."**
57
+
58
+ **Context:** {context}
59
+ **Question:** {question}
60
+
61
+ **Answer:**
62
+ """,
63
+ input_variables=["context", "question"]
64
+ )
65
+
66
+ def generate_response(prompt, vectorstore, llm):
67
+ """Retrieves relevant documents and generates a response from the LLM."""
68
+ if not vectorstore or not llm:
69
+ return "❌ Unable to process your request due to initialization issues."
70
+
71
+ try:
72
+ qa_chain = RetrievalQA.from_chain_type(
73
+ llm=llm,
74
+ chain_type="stuff",
75
+ retriever=vectorstore.as_retriever(search_kwargs={'k': 3}),
76
+ return_source_documents=True,
77
+ chain_type_kwargs={'prompt': set_custom_prompt()}
78
+ )
79
+
80
+ response_data = qa_chain.invoke({'query': prompt})
81
+ result = response_data.get("result", "")
82
+ source_documents = response_data.get("source_documents", [])
83
+
84
+ if not result or not source_documents:
85
+ return "❌ Sorry, but I can only provide answers based on the available documents."
86
+
87
+ formatted_sources = "\n\nπŸ“š **Sources:**" + "".join(
88
+ [f"\n- {doc.metadata.get('source', 'Unknown')} (Page: {doc.metadata.get('page', 'N/A')})" for doc in source_documents]
89
+ )
90
+ return f"{result}{formatted_sources}"
91
+
92
+ except Exception as e:
93
+ return f"⚠️ **Error:** {str(e)}"
94
+
95
+ def main():
96
+ """Runs the Streamlit chatbot application."""
97
+ st.title("🧠 Brainmines SEO Chatbot - Your AI Assistant for SEO Queries πŸš€")
98
+
99
+ # βœ… Load vector store and LLM
100
+ vectorstore = get_vectorstore()
101
+ llm = load_llm()
102
+
103
+ if not vectorstore or not llm:
104
+ st.error("⚠️ Failed to initialize vector store or LLM. Please check configurations.")
105
+ return
106
+
107
+ # βœ… Initialize session state
108
+ if "messages" not in st.session_state:
109
+ st.session_state.messages = [
110
+ {"role": "assistant", "content": "Hello! πŸ‘‹ I'm here to assist you with SEO-related queries. πŸš€"},
111
+ ]
112
+
113
+ # βœ… Display chat history
114
+ for message in st.session_state.messages:
115
+ st.chat_message(message["role"]).markdown(message["content"])
116
+
117
+ prompt = st.chat_input("πŸ’¬ Enter your SEO question here")
118
+
119
+ if prompt:
120
+ st.chat_message("user").markdown(prompt)
121
+ st.session_state.messages.append({"role": "user", "content": prompt})
122
+
123
+ with st.spinner("Thinking... πŸ€”"):
124
+ response = generate_response(prompt, vectorstore, llm)
125
+
126
+ st.chat_message("assistant").markdown(response)
127
+ st.session_state.messages.append({"role": "assistant", "content": response})
128
+
129
+ if __name__ == "__main__":
130
+ main()
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain-community
4
+ langchain-huggingface
5
+ python-dotenv
6
+ faiss-cpu
7
+ pytesseract
8
+ pillow
9
+ opencv-python-headless
10
+ python-docx
11
+ python-pptx
12
+ pandas
13
+ numpy
14
+ huggingface_hub
15
+ requests
16
+ transformers
17
+ sentence-transformers
18
+ torch