SumbalFatima1122 committed
Commit 37f721f · verified · 1 Parent(s): 7b39b9b

Update app.py

Files changed (1)
  1. app.py +81 -93
app.py CHANGED
@@ -1,103 +1,91 @@
-# Install necessary libraries
+# Required module installations (uncomment and run in your environment if needed)
+# !pip install requests PyPDF2 langchain faiss-cpu streamlit groq sentence-transformers
 
-
-# Set up API key
+import requests
+import io
+import PyPDF2
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings  # Open-source embedding model
+from langchain.vectorstores import FAISS
+from groq import Groq
 import os
-os.environ['GROQ_API_KEY'] = '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'
+import streamlit as st
 
-# Download and process PDFs from public Google Drive links
-import requests
-import pdfplumber
+# Set up Groq API
+os.environ["GROQ_API_KEY"] = "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"  # Replace with your Groq API key
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 
-def download_from_drive(link):
+# Function to download PDF from Google Drive link
+def download_pdf_from_link(link):
     file_id = link.split('/d/')[1].split('/')[0]
-    download_url = f"https://drive.google.com/uc?id={file_id}&export=download"
+    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
     response = requests.get(download_url)
     if response.status_code == 200:
-        file_path = f"{file_id}.pdf"
-        with open(file_path, 'wb') as f:
-            f.write(response.content)
-        print(f"PDF downloaded successfully: {file_path}")
-        return file_path
+        return response.content
     else:
-        raise Exception("Failed to download file. Please check the link.")
-
-def extract_text_from_pdf(file_path):
-    try:
-        with pdfplumber.open(file_path) as pdf:
-            text = ''.join(page.extract_text() for page in pdf.pages)
-        print(f"Extracted text length: {len(text)}")
-        return text
-    except Exception as e:
-        print(f"Error extracting text: {e}")
-        return ""
-
-# Preprocess documents into chunks
-from langchain.text_splitter import CharacterTextSplitter
-
-def preprocess_document(content):
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    return text_splitter.split_text(content)
-
-# Generate embeddings
-from sentence_transformers import SentenceTransformer
-
-def generate_embeddings(text_chunks):
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    return [model.encode(chunk) for chunk in text_chunks]
-
-# Store embeddings in FAISS
-import faiss
-import numpy as np
-
-def create_faiss_index(embeddings):
-    dimension = len(embeddings[0])
-    index = faiss.IndexFlatL2(dimension)
-    index.add(np.array(embeddings))
-    faiss.write_index(index, "faiss_index.index")
-    print("Embeddings stored in FAISS.")
-
-# Query the Groq model
-from groq import Groq
-
-client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-
-def query_model(prompt):
-    response = client.chat.completions.create(
-        messages=[{"role": "user", "content": prompt}],
-        model="llama3-8b-8192"
+        raise Exception("Failed to download file. Check the link.")
+
+# Function to extract text from PDF
+def read_pdf(pdf_content):
+    file_io = io.BytesIO(pdf_content)
+    pdf_reader = PyPDF2.PdfReader(file_io)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    return text
+
+# Function to create chunks of text
+def create_chunks(documents):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = []
+    for doc in documents:
+        chunks.extend(text_splitter.split_text(doc))
+    return chunks
+
+# Function to query the Groq API with vectorstore
+def query_with_groq(query, vectorstore):
+    docs = vectorstore.similarity_search(query, k=3)
+    context = " ".join([doc.page_content for doc in docs])
+
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": f"{context}\n\n{query}"}
+        ],
+        model="llama3-8b-8192",
     )
-    return response.choices[0].message.content
-
-# Streamlit Frontend
-import streamlit as st
-
-st.title("RAG Application with Google Drive Documents")
-
-doc_links = [
-    "https://drive.google.com/file/d/1zoo4-GNIGPtbT_Yb4nIZw-qYf8Wj57nP/view?usp=sharing"
-]
-
-query = st.text_input("Enter your query:")
-
-if query:
-    all_chunks = []
-    for link in doc_links:
-        try:
-            file_path = download_from_drive(link)
-            extracted_text = extract_text_from_pdf(file_path)
-            if extracted_text.strip():
-                text_chunks = preprocess_document(extracted_text)
-                embeddings = generate_embeddings(text_chunks)
-                create_faiss_index(embeddings)
-                all_chunks.extend(text_chunks)
-            else:
-                st.error("Failed to extract text from the document.")
-        except Exception as e:
-            st.error(f"Error processing document: {e}")
-
-    if all_chunks:
-        result = query_model(query)
-        st.write(result)
+    return chat_completion.choices[0].message.content
+
+# Main function to initialize the app
+def main():
+    st.title("RAG Application with Google Drive Links")
+
+    # Input links (replace these with your document links)
+    links = [
+        "https://drive.google.com/file/d/1zoo4-GNIGPtbT_Yb4nIZw-qYf8Wj57nP/view?usp=sharing"
+        # Add more links here if needed
+    ]
+
+    # Load or process documents
+    if "vectorstore" not in st.session_state:
+        documents = [read_pdf(download_pdf_from_link(link)) for link in links]
+        chunks = create_chunks(documents)
+
+        # Generate embeddings and store in FAISS
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        vectorstore = FAISS.from_texts(chunks, embeddings)
+        vectorstore.save_local("faiss_index")
+        st.session_state.vectorstore = vectorstore
     else:
-        st.error("No valid data to process. Please check your document links.")
+        vectorstore = st.session_state.vectorstore
+
+    # Query input from user
+    query = st.text_input("Enter your query:")
+    if query:
+        response = query_with_groq(query, vectorstore)
+        st.write("Response:")
+        st.write(response)
+
+# Run the app
+if __name__ == "__main__":
+    main()
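
For reference, a minimal sketch (not part of the commit) of how a later run could reload the index that vectorstore.save_local("faiss_index") persists, instead of re-downloading and re-embedding the PDFs after every process restart. It assumes the same embedding model as the app and a LangChain version whose FAISS.load_local accepts the allow_dangerous_deserialization flag (newer releases require it because the saved index metadata is pickled):

import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Rebuild the embedding wrapper with the same model used to create the index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

if os.path.isdir("faiss_index"):
    # Load the persisted index rather than re-embedding every document.
    vectorstore = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,  # assumption: required/available in the installed LangChain version
    )
    # The reloaded store supports the same retrieval call the app uses.
    docs = vectorstore.similarity_search("example query", k=3)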