NHZ committed on
Commit
f2ab7e6
·
verified ·
1 Parent(s): 0519d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -67
app.py CHANGED
@@ -1,86 +1,69 @@
1
  import os
2
- import requests
 
3
  import numpy as np
4
- import faiss
5
- from PyPDF2 import PdfReader
6
- from transformers import AutoTokenizer, AutoModel
7
  from langchain.vectorstores import FAISS
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
- from langchain.chains import RetrievalQA
 
 
10
  from langchain.prompts import PromptTemplate
11
- from langchain.chat_models import ChatOpenAI
12
- from groq import Groq
13
  import streamlit as st
14
 
15
# Groq API client, configured from the environment.
# NOTE(review): `client` does not appear to be referenced later in this
# script — confirm it is still needed before removing.
groq_api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=groq_api_key)
 
 
17
 
18
def extract_pdf_content(drive_url):
    """Download a publicly shared Google Drive PDF and return its text.

    Parameters
    ----------
    drive_url : str
        A Google Drive share link of the form
        ``https://drive.google.com/file/d/<FILE_ID>/view...``.

    Returns
    -------
    str or None
        Concatenated text of every page, or ``None`` if the URL is
        malformed or the download fails.
    """
    import io  # function-scope stdlib import; module import block untouched

    # Pull the file ID out of the share link; a malformed URL yields None
    # instead of raising IndexError.
    try:
        file_id = drive_url.split("/d/")[1].split("/view")[0]
    except IndexError:
        return None
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    # Bounded timeout so a hung download cannot block the app forever;
    # network errors are reported as a failed extraction, not a crash.
    try:
        response = requests.get(download_url, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    # Parse the PDF in memory instead of leaking "document.pdf" in the CWD.
    reader = PdfReader(io.BytesIO(response.content))
    # extract_text() can return None for image-only pages; treat those as "".
    return "".join(page.extract_text() or "" for page in reader.pages)
37
 
38
# ---- Streamlit front end ---------------------------------------------------
st.title("Enhanced RAG with LangChain and Groq API")

# The document is fixed for this demo: a publicly shared Google Drive PDF.
drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

st.write("Extracting content from the document...")
text = extract_pdf_content(drive_url)

if not text:
    st.error("Failed to extract content from the document.")
else:
    st.write("Document extracted successfully!")

    # Build the embedding model and index the whole document as one text.
    st.write("Building embeddings and FAISS index...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    faiss_index = FAISS.from_texts([text], embeddings)

    # Retriever returning the 3 most similar chunks per query.
    retriever = faiss_index.as_retriever(search_kwargs={"k": 3})

    # Prompt used by the RetrievalQA chain; {context} is filled from the
    # retrieved excerpts, {question} from the user's query.
    prompt_template = """
Use the following document excerpts to answer the user's question.
If the answer is not directly found in the document, say "The answer is not in the provided document.".

Document Excerpts:
{context}

Question:
{question}

Answer:
"""
    qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
    )

    # Query loop: retrieve, generate, display.
    query = st.text_input("Enter your query:")
    if query:
        st.write("Searching the document and generating a response...")
        st.write("Response:", qa_chain.run(query))
86
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import re
3
+ import torch
4
  import numpy as np
 
 
 
5
  from langchain.vectorstores import FAISS
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.document_loaders import PyPDFLoader
8
+ from langchain.text_splitter import CharacterTextSplitter
9
+ from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
+ from langchain.llms import HuggingFaceHub
 
12
  import streamlit as st
13
 
14
# ---- Configuration ---------------------------------------------------------
# Fail fast if the HuggingFace Hub credential is absent.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not HUGGINGFACEHUB_API_TOKEN:
    raise ValueError("HuggingFace API Token is missing.")

# Sentence-level embedding model backing the vector index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# ---- Document ingestion ----------------------------------------------------
# Source document: a publicly shared Google Drive PDF (direct-download URL).
pdf_url = "https://drive.google.com/uc?id=1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0"
documents = PyPDFLoader(pdf_url).load()

# Chunk the pages so each piece fits comfortably in the LLM's context window.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = splitter.split_documents(documents)

# ---- Retrieval + generation ------------------------------------------------
# FAISS vector database over the chunked document.
db = FAISS.from_documents(texts, embeddings)

# Hosted LLM (example model, replace as needed).
llm = HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature": 0, "max_length": 512})

# Prompt for the "stuff" QA chain; {context} is filled with the retrieved
# chunks, {question} with the user's query.
prompt_template = """
Use the following pieces of context to answer the question at the end.
If the question cannot be answered based on the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# QA chain that stuffs the retrieved documents into a single prompt.
qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
 
 
 
 
 
 
54
 
55
# ---- Streamlit frontend ----------------------------------------------------
st.title("RAG-based Document Q&A")
# BUG FIX: the old caption claimed upload support, but no upload widget
# exists — the document is preloaded above.
st.write("Ask questions about the preloaded document.")

query = st.text_input("Enter your question:")
if query:
    # Retrieve the 4 chunks most similar to the query from the FAISS index.
    docs = db.similarity_search(query, k=4)

    # BUG FIX: load_qa_chain(..., chain_type="stuff") expects the retrieved
    # documents under the `input_documents` key and builds the {context}
    # prompt variable itself by stuffing those documents in. Passing a
    # pre-joined "context" string raised a missing-input-key error at
    # runtime, so the manual page_content join has been removed.
    answer = qa_chain.run(input_documents=docs, question=query)
    st.write("**Answer:**", answer)