Spaces:

NHZ
/

First_Aid_Kit

Sleeping

App Files Files Community

NHZ committed on Jan 4, 2025

Commit

f2ab7e6

verified ·

1 Parent(s): 0519d7a

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -67

app.py CHANGED Viewed

@@ -1,86 +1,69 @@
 import os
-import requests
 import numpy as np
-import faiss
-from PyPDF2 import PdfReader
-from transformers import AutoTokenizer, AutoModel
 from langchain.vectorstores import FAISS
 from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
-from langchain.chat_models import ChatOpenAI
-from groq import Groq
 import streamlit as st
-# Initialize Groq client
-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-# Function to download and extract content from a public Google Drive PDF link
-def extract_pdf_content(drive_url):
-    # Extract file ID from the Google Drive URL
-    file_id = drive_url.split("/d/")[1].split("/view")[0]
-    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
-    # Download the PDF content
-    response = requests.get(download_url)
-    if response.status_code != 200:
-        return None
-    # Save and extract text from the PDF
-    with open("document.pdf", "wb") as f:
-        f.write(response.content)
-    reader = PdfReader("document.pdf")
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
-    return text
-# Streamlit app
-st.title("Enhanced RAG with LangChain and Groq API")
-# Predefined Google Drive link
-drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
-# Extract document content
-st.write("Extracting content from the document...")
-text = extract_pdf_content(drive_url)
-if text:
-    st.write("Document extracted successfully!")
-    # LangChain embeddings and FAISS index setup
-    st.write("Building embeddings and FAISS index...")
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    faiss_index = FAISS.from_texts([text], embeddings)
-    # LangChain retriever
-    retriever = faiss_index.as_retriever(search_kwargs={"k": 3})
-    # LangChain QA chain
-    prompt_template = """
-    Use the following document excerpts to answer the user's question.
-    If the answer is not directly found in the document, say "The answer is not in the provided document.".
-    Document Excerpts:
-    {context}
-    Question:
-    {question}
-    Answer:
-    """
-    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-    qa_chain = RetrievalQA.from_chain_type(
-        llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
-        retriever=retriever,
-        chain_type_kwargs={"prompt": PROMPT},
-    )
-    # Query input
-    query = st.text_input("Enter your query:")
-    if query:
-        st.write("Searching the document and generating a response...")
-        result = qa_chain.run(query)
-        st.write("Response:", result)
-else:
-    st.error("Failed to extract content from the document.")

 import os
+import re
+import torch
 import numpy as np
 from langchain.vectorstores import FAISS
 from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
+from langchain.llms import HuggingFaceHub
 import streamlit as st
+# Environment setup
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+if not HUGGINGFACEHUB_API_TOKEN:
+    raise ValueError("HuggingFace API Token is missing.")
+# Initialize HuggingFace embeddings model
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# Load PDF document from Google Drive
+pdf_url = "https://drive.google.com/uc?id=1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0"
+loader = PyPDFLoader(pdf_url)
+documents = loader.load()
+# Split text into chunks
+text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+texts = text_splitter.split_documents(documents)
+# Create FAISS vector database
+db = FAISS.from_documents(texts, embeddings)
+# Initialize HuggingFace LLM (example model, replace as needed)
+llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature": 0, "max_length": 512})
+# Define custom prompt
+prompt_template = """
+Use the following pieces of context to answer the question at the end.
+If the question cannot be answered based on the context, say "I don't know."
+Context:
+{context}
+Question:
+{question}
+Answer:
+"""
+prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+# Load QA chain
+qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
+# Streamlit frontend
+st.title("RAG-based Document Q&A")
+st.write("Upload a document and ask questions about it.")
+query = st.text_input("Enter your question:")
+if query:
+    # Search vector database
+    docs = db.similarity_search(query, k=4)
+    # Get relevant context
+    context = "\n\n".join([doc.page_content for doc in docs])
+    # Generate answer using LLM
+    answer = qa_chain.run({"context": context, "question": query})
+    st.write("**Answer:**", answer)