Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Community

pradeepsengarr committed on Apr 16, 2025

Commit

b58d6fd

verified ·

1 Parent(s): 7afdcd2

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -201

app.py CHANGED Viewed

@@ -1,225 +1,225 @@
-# import os
-# import streamlit as st
-# import fitz  # PyMuPDF
-# import logging
-# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-# from langchain.text_splitter import RecursiveCharacterTextSplitter
-# from langchain_community.vectorstores import Chroma
-# from langchain_community.embeddings import SentenceTransformerEmbeddings
-# from langchain_community.llms import HuggingFacePipeline
-# from langchain.chains import RetrievalQA
-# from langchain.prompts import PromptTemplate
-# from langchain_community.document_loaders import TextLoader
-# # --- Configuration ---
-# st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
-# st.title("📚 RAG-based PDF Chatbot")
-# device = "cpu"
-# # --- Logging ---
-# logging.basicConfig(level=logging.INFO)
-# # --- Load LLM ---
-# @st.cache_resource
-# def load_model():
-#     checkpoint = "MBZUAI/LaMini-T5-738M"
-#     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-#     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-#     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
-#     return HuggingFacePipeline(pipeline=pipe)
-# # --- Extract PDF Text ---
-# def read_pdf(file):
-#     try:
-#         doc = fitz.open(stream=file.read(), filetype="pdf")
-#         text = ""
-#         for page in doc:
-#             text += page.get_text()
-#         return text.strip()
-#     except Exception as e:
-#         logging.error(f"Failed to extract text: {e}")
-#         return ""
-# # --- Process Answer ---
-# def process_answer(question, full_text):
-#     # Save the full_text to a temporary file
-#     with open("temp_text.txt", "w") as f:
-#         f.write(full_text)
-#     loader = TextLoader("temp_text.txt")
-#     docs = loader.load()
-#     # Chunk the documents with increased size and overlap
-#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-#     splits = text_splitter.split_documents(docs)
-#     # Load embeddings
-#     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-#     # Create Chroma in-memory vector store
-#     db = Chroma.from_documents(splits, embedding=embeddings)
-#     retriever = db.as_retriever()
-#     # Set up the model
-#     llm = load_model()
-#     # Create a custom prompt
-#     prompt_template = PromptTemplate(
-#     input_variables=["context", "question"],
-#     template="""
-#     You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.
-#     Context:
-#     {context}
-#     Question:
-#     {question}
-#     Important Instructions:
-#     - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
-#     - Do NOT summarize or paraphrase.
-#     - If the information is not in the context, say "Not found in the document."
-#     Answer:
-#     """)
-#     # Retrieval QA with custom prompt
-#     qa_chain = RetrievalQA.from_chain_type(
-#         llm=llm,
-#         retriever=retriever,
-#         chain_type="stuff",
-#         chain_type_kwargs={"prompt": prompt_template}
-#     )
-#     # Return the answer using the retrieval QA chain
-#     return qa_chain.run(question)
-# # --- UI Layout ---
-# with st.sidebar:
-#     st.header("📄 Upload PDF")
-#     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-# # --- Main Interface ---
-# if uploaded_file:
-#     st.success(f"You uploaded: {uploaded_file.name}")
-#     full_text = read_pdf(uploaded_file)
-#     if full_text:
-#         st.subheader("📁 PDF Preview")
-#         with st.expander("View Extracted Text"):
-#             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
-#         st.subheader("💬 Ask a Question")
-#         user_question = st.text_input("Type your question about the PDF content")
-#         if user_question:
-#             with st.spinner("Thinking..."):
-#                 answer = process_answer(user_question, full_text)
-#                 st.markdown("### 🤖 Answer")
-#                 st.write(answer)
-#         with st.sidebar:
-#             st.markdown("---")
-#             st.markdown("**💡 Suggestions:**")
-#             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-#         with st.expander("💡 Suggestions", expanded=True):
-#             st.markdown("""
-#             - "Summarize this document"
-#             - "Give a quick summary"
-#             - "What are the main points?"
-#             - "Explain this document in short"
-#             """)
-#     else:
-#         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
-# else:
-#     st.info("Upload a PDF to begin.")
-import os
-import streamlit as st
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain.chains import RetrievalQA
-from langchain.prompts import PromptTemplate
-from langchain.llms import HuggingFaceHub
-# Set your Hugging Face API token here
-os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
-# Load and split PDF
-def load_and_split_pdf(uploaded_file):
-    with open("temp.pdf", "wb") as f:
-        f.write(uploaded_file.read())
-    loader = PyPDFLoader("temp.pdf")
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-    chunks = text_splitter.split_documents(documents)
-    return chunks
-# Build vectorstore
-def build_vectorstore(chunks):
-    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
-    return vectorstore
-# Load Lamini or other HF model
-def get_llm():
-    return HuggingFaceHub(
-        repo_id="lamini/lamini-13b-chat",
-        model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
-    )
-# Create prompt template (optional for better accuracy)
-custom_prompt = PromptTemplate(
-    input_variables=["context", "question"],
-    template="""
-You are a helpful assistant. Use the following context to answer the question as accurately as possible.
-If the answer is not in the context, respond with "Not found in the document."
-Context:
-{context}
-Question: {question}
-Answer:"""
-)
-# Build QA chain
-def build_qa_chain(vectorstore):
-    llm = get_llm()
-    qa_chain = RetrievalQA.from_chain_type(
-        llm=llm,
-        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
-        chain_type_kwargs={"prompt": custom_prompt}
-    )
-    return qa_chain
-# Streamlit UI
-def main():
-    st.set_page_config(page_title="PDF Chatbot", layout="wide")
-    st.title("Chat with your PDF")
-    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-    if uploaded_file:
-        st.success("PDF uploaded successfully!")
-        with st.spinner("Processing PDF..."):
-            chunks = load_and_split_pdf(uploaded_file)
-            vectorstore = build_vectorstore(chunks)
-            qa_chain = build_qa_chain(vectorstore)
-        st.success("Ready to chat!")
-        user_question = st.text_input("Ask a question based on the PDF:")
-        if user_question:
-            with st.spinner("Generating answer..."):
-                result = qa_chain.run(user_question)
-                st.markdown("**Answer:**")
-                st.write(result)
-if __name__ == "__main__":
-    main()

+import os
+import streamlit as st
+import fitz  # PyMuPDF
+import logging
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import SentenceTransformerEmbeddings
+from langchain_community.llms import HuggingFacePipeline
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from langchain_community.document_loaders import TextLoader
+# --- Configuration ---
+st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+st.title("📚 RAG-based PDF Chatbot")
+device = "cpu"
+# --- Logging ---
+logging.basicConfig(level=logging.INFO)
+# --- Load LLM ---
+@st.cache_resource
+def load_model():
+    checkpoint = "MBZUAI/LaMini-T5-738M"
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+    pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
+    return HuggingFacePipeline(pipeline=pipe)
+# --- Extract PDF Text ---
+def read_pdf(file):
+    try:
+        doc = fitz.open(stream=file.read(), filetype="pdf")
+        text = ""
+        for page in doc:
+            text += page.get_text()
+        return text.strip()
+    except Exception as e:
+        logging.error(f"Failed to extract text: {e}")
+        return ""
+# --- Process Answer ---
+def process_answer(question, full_text):
+    # Save the full_text to a temporary file
+    with open("temp_text.txt", "w") as f:
+        f.write(full_text)
+    loader = TextLoader("temp_text.txt")
+    docs = loader.load()
+    # Chunk the documents with increased size and overlap
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
+    splits = text_splitter.split_documents(docs)
+    # Load embeddings
+    embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+    # Create Chroma in-memory vector store
+    db = Chroma.from_documents(splits, embedding=embeddings)
+    retriever = db.as_retriever()
+    # Set up the model
+    llm = load_model()
+    # Create a custom prompt
+    prompt_template = PromptTemplate(
+    input_variables=["context", "question"],
+    template="""
+    You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.
+    Context:
+    {context}
+    Question:
+    {question}
+    Important Instructions:
+    - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
+    - Do NOT summarize or paraphrase.
+    - If the information is not in the context, say "Not found in the document."
+    Answer:
+    """)
+    # Retrieval QA with custom prompt
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        chain_type="stuff",
+        chain_type_kwargs={"prompt": prompt_template}
+    )
+    # Return the answer using the retrieval QA chain
+    return qa_chain.run(question)
+# --- UI Layout ---
+with st.sidebar:
+    st.header("📄 Upload PDF")
+    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+# --- Main Interface ---
+if uploaded_file:
+    st.success(f"You uploaded: {uploaded_file.name}")
+    full_text = read_pdf(uploaded_file)
+    if full_text:
+        st.subheader("📁 PDF Preview")
+        with st.expander("View Extracted Text"):
+            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
+        st.subheader("💬 Ask a Question")
+        user_question = st.text_input("Type your question about the PDF content")
+        if user_question:
+            with st.spinner("Thinking..."):
+                answer = process_answer(user_question, full_text)
+                st.markdown("### 🤖 Answer")
+                st.write(answer)
+        with st.sidebar:
+            st.markdown("---")
+            st.markdown("**💡 Suggestions:**")
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+        with st.expander("💡 Suggestions", expanded=True):
+            st.markdown("""
+            - "Summarize this document"
+            - "Give a quick summary"
+            - "What are the main points?"
+            - "Explain this document in short"
+            """)
+    else:
+        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
+else:
+    st.info("Upload a PDF to begin.")
+# import os
+# import streamlit as st
+# from langchain_community.document_loaders import PyPDFLoader
+# from langchain_text_splitters import RecursiveCharacterTextSplitter
+# from langchain_community.embeddings import HuggingFaceEmbeddings
+# from langchain_community.vectorstores import FAISS
+# from langchain.chains import RetrievalQA
+# from langchain.prompts import PromptTemplate
+# from langchain.llms import HuggingFaceHub
+# # Set your Hugging Face API token here
+# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
+# # Load and split PDF
+# def load_and_split_pdf(uploaded_file):
+#     with open("temp.pdf", "wb") as f:
+#         f.write(uploaded_file.read())
+#     loader = PyPDFLoader("temp.pdf")
+#     documents = loader.load()
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+#     chunks = text_splitter.split_documents(documents)
+#     return chunks
+# # Build vectorstore
+# def build_vectorstore(chunks):
+#     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+#     vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
+#     return vectorstore
+# # Load Lamini or other HF model
+# def get_llm():
+#     return HuggingFaceHub(
+#         repo_id="lamini/lamini-13b-chat",
+#         model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
+#     )
+# # Create prompt template (optional for better accuracy)
+# custom_prompt = PromptTemplate(
+#     input_variables=["context", "question"],
+#     template="""
+# You are a helpful assistant. Use the following context to answer the question as accurately as possible.
+# If the answer is not in the context, respond with "Not found in the document."
+# Context:
+# {context}
+# Question: {question}
+# Answer:"""
+# )
+# # Build QA chain
+# def build_qa_chain(vectorstore):
+#     llm = get_llm()
+#     qa_chain = RetrievalQA.from_chain_type(
+#         llm=llm,
+#         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
+#         chain_type_kwargs={"prompt": custom_prompt}
+#     )
+#     return qa_chain
+# # Streamlit UI
+# def main():
+#     st.set_page_config(page_title="PDF Chatbot", layout="wide")
+#     st.title("Chat with your PDF")
+#     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+#     if uploaded_file:
+#         st.success("PDF uploaded successfully!")
+#         with st.spinner("Processing PDF..."):
+#             chunks = load_and_split_pdf(uploaded_file)
+#             vectorstore = build_vectorstore(chunks)
+#             qa_chain = build_qa_chain(vectorstore)
+#         st.success("Ready to chat!")
+#         user_question = st.text_input("Ask a question based on the PDF:")
+#         if user_question:
+#             with st.spinner("Generating answer..."):
+#                 result = qa_chain.run(user_question)
+#                 st.markdown("**Answer:**")
+#                 st.write(result)
+# if __name__ == "__main__":
+#     main()