Spaces:

I-AdityaGoyal
/

NoteBot

Sleeping

App Files Files Community

I-AdityaGoyal committed on Jul 20, 2024

Commit

1272fb9

verified ·

1 Parent(s): 2a31972

Upload 8 files

Browse files

Files changed (8) hide show

app.py +93 -0
faiss_indexing.py +20 -0
pdf_generator.py +23 -0
pdf_processing.py +14 -0
requirements.txt +12 -0
text_to_speech.py +6 -0
utils.py +26 -0
youtube_processing.py +16 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import streamlit as st
+import os
+from pdf_processing import extract_text_from_pdf
+from youtube_processing import extract_text_from_youtube
+from faiss_indexing import get_embeddings, create_faiss_index, query_faiss_index
+from utils import load_environment_variables, query_huggingface_api, chunk_text
+from pdf_generator import generate_pdf
+from text_to_speech import speak_text
+from sentence_transformers import SentenceTransformer
+# Load environment variables
+hf_token = load_environment_variables()
+if not hf_token:
+    st.error("Hugging Face API token is missing. Please add it to your .env file.")
+    st.stop()
+# Define the Hugging Face API endpoint
+API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+headers = {
+    "Authorization": f"Bearer {hf_token}"
+}
+# Initialize the sentence transformer model
+model_name = 'all-MiniLM-L6-v2'
+model = SentenceTransformer(model_name)
+# Scratch directory for uploaded PDFs and the generated response PDF. It is
+# created up front so the PDF export also works when only a YouTube URL is given
+# (previously the directory was created only inside the PDF upload loop).
+temp_dir = "temp"
+os.makedirs(temp_dir, exist_ok=True)
+# Streamlit UI
+st.title("NoteBot - Notes Retrieval System")
+st.write("By - Aditya Goyal")
+st.write("Upload PDFs or provide YouTube links to ask questions about their content.")
+uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
+youtube_url = st.text_input("Enter YouTube video URL:")
+# Text chunks collected from every source; this is the corpus that gets indexed.
+all_chunks = []
+# Process PDF files
+if uploaded_files:
+    for uploaded_file in uploaded_files:
+        # Keep only the final path component so a crafted upload name cannot
+        # write outside the scratch directory.
+        pdf_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))
+        with open(pdf_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        text = extract_text_from_pdf(pdf_path)
+        all_chunks.extend(chunk_text(text))
+# Process YouTube video
+if youtube_url:
+    yt_text = extract_text_from_youtube(youtube_url)
+    all_chunks.extend(chunk_text(yt_text))
+if all_chunks:
+    embeddings = get_embeddings(all_chunks, model)
+    faiss_index = create_faiss_index(embeddings)
+    query_text = st.text_input("Enter your query:")
+    if query_text:
+        query_embedding = get_embeddings([query_text], model)
+        distances, indices = query_faiss_index(faiss_index, query_embedding)
+        # FAISS pads the result with -1 when fewer vectors are indexed than were
+        # requested; drop those so -1 is not interpreted as "last chunk".
+        similar_chunks = [all_chunks[i] for i in indices[0] if 0 <= i < len(all_chunks)]
+        # Ensure we only use a manageable number of chunks
+        selected_chunks = similar_chunks[:5]
+        template = """Based on the following chunks: {similar_chunks}
+        Question: {question}
+        Answer:"""
+        prompt_text = template.format(similar_chunks="\n".join(selected_chunks), question=query_text)
+        # Generate response from Hugging Face API
+        response = query_huggingface_api(prompt_text, API_URL, headers)
+        # query_huggingface_api reports failures as a string that begins with
+        # "Error"; checking the prefix keeps answers that merely mention the
+        # word "Error" from being shown as failures.
+        if not response.startswith("Error"):
+            st.write("**Answer:**", response)
+            # Add button to download response as PDF
+            if st.button("Download Response as PDF"):
+                pdf_path = os.path.join(temp_dir, "response.pdf")
+                generate_pdf(response, pdf_path)
+                with open(pdf_path, "rb") as f:
+                    st.download_button(label="Download PDF", data=f, file_name="response.pdf")
+            # Add button to speak the response text
+            if st.button("Speak Response"):
+                speak_text(response)
+        else:
+            st.error(response)

faiss_indexing.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+def get_embeddings(texts, model):
+    """Encode `texts` with `model` and return the result as a tensor.
+
+    `model` must provide `encode(texts, convert_to_tensor=True)`; the value it
+    returns is passed back unchanged.
+    """
+    return model.encode(texts, convert_to_tensor=True)
+def create_faiss_index(embeddings):
+    """Build an L2 FAISS index whose ids are the row positions of `embeddings`.
+
+    Args:
+        embeddings: tensor of embeddings exposing `.cpu().numpy()`, one row
+            per text chunk.
+
+    Returns:
+        A `faiss.IndexIDMap` wrapping an `IndexFlatL2`, with row i stored
+        under id i.
+    """
+    # Move to CPU and convert to numpy; FAISS expects contiguous float32 rows.
+    vectors = np.ascontiguousarray(embeddings.cpu().numpy(), dtype=np.float32)
+    # FAISS ids are 64-bit; np.arange's default integer type is platform
+    # dependent, so request int64 explicitly.
+    ids = np.arange(len(vectors), dtype=np.int64)
+    faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(vectors.shape[1]))
+    faiss_index.add_with_ids(vectors, ids)
+    return faiss_index
+def query_faiss_index(index, query_embedding, k=5):
+    """Search `index` for the `k` nearest neighbours of `query_embedding`.
+
+    The query tensor is moved to CPU and converted to numpy before the search.
+    Returns the `(distances, indices)` pair produced by `index.search`.
+    """
+    query_vectors = query_embedding.cpu().numpy()
+    return index.search(query_vectors, k)

pdf_generator.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from fpdf import FPDF
+class PDF(FPDF):
+    """FPDF document with a fixed 'NoteBot Response' page header."""
+    def header(self):
+        """Draw the centred, bold page header."""
+        self.set_font('Arial', 'B', 12)
+        self.cell(0, 10, 'NoteBot Response', border=0, ln=1, align='C')
+    def chapter_title(self, title):
+        """Write `title` in bold, left-aligned, followed by a 10-unit gap."""
+        self.set_font('Arial', 'B', 12)
+        self.cell(0, 10, title, border=0, ln=1, align='L')
+        self.ln(10)
+    def chapter_body(self, body):
+        """Write `body` as wrapped regular text, then add a line break."""
+        self.set_font('Arial', '', 12)
+        self.multi_cell(0, 10, body)
+        self.ln()
+def generate_pdf(text, path):
+    """Render `text` as a one-section PDF titled 'Response:' and save it to `path`.
+
+    Args:
+        text: body text to place in the document.
+        path: destination file path handed to `pdf.output`.
+    """
+    # The built-in Arial font used by PDF only covers Latin-1. Model output
+    # often contains curly quotes, dashes or emoji, which would make the PDF
+    # write fail, so unsupported characters are replaced with '?'.
+    safe_text = text.encode('latin-1', 'replace').decode('latin-1')
+    pdf = PDF()
+    pdf.add_page()
+    pdf.chapter_title('Response:')
+    pdf.chapter_body(safe_text)
+    pdf.output(path)

pdf_processing.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import fitz  # PyMuPDF
+def extract_text_from_pdf(pdf_path):
+    """Return the concatenated text of every page in the PDF at `pdf_path`.
+
+    This is best-effort: any failure (missing file, unreadable PDF, ...) is
+    printed and an empty string is returned instead of raising.
+    """
+    try:
+        # The context manager closes the document even if reading a page fails.
+        with fitz.open(pdf_path) as pdf_document:
+            return "".join(page.get_text() for page in pdf_document)
+    except Exception as e:
+        print(f"Error extracting text from PDF: {e}")
+        return ""

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+streamlit
+PyMuPDF
+numpy
+faiss-cpu
+sentence-transformers
+python-dotenv
+requests
+langchain
+youtube-transcript-api
+speechrecognition
+fpdf
+pyttsx3

text_to_speech.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import pyttsx3
+def speak_text(text):
+    """Queue `text` on a newly initialised pyttsx3 engine and run it with runAndWait()."""
+    tts = pyttsx3.init()
+    tts.say(text)
+    tts.runAndWait()

utils.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+from dotenv import load_dotenv
+def load_environment_variables():
+    """Call `load_dotenv()` and return the `HF_TOKEN` environment variable.
+
+    Returns None when `HF_TOKEN` is not set.
+    """
+    load_dotenv()
+    return os.getenv("HF_TOKEN")
+def query_huggingface_api(prompt, api_url, headers, timeout=60):
+    """POST `prompt` to a Hugging Face text-generation endpoint and return the answer.
+
+    Args:
+        prompt: full prompt text, sent as the `inputs` field of the JSON body.
+        api_url: endpoint URL.
+        headers: HTTP headers (carries the Authorization token).
+        timeout: seconds to wait for the request before giving up.
+
+    Returns:
+        The generated answer on success. On any failure (network error,
+        non-200 status, unexpected payload) a message that starts with
+        "Error" is returned instead of raising, matching the error-string
+        convention callers already check for.
+    """
+    import requests
+    try:
+        response = requests.post(
+            api_url, headers=headers, json={"inputs": prompt}, timeout=timeout
+        )
+    except requests.RequestException as exc:
+        return f"Error: request to Hugging Face API failed: {exc}"
+    if response.status_code != 200:
+        return f"Error {response.status_code}: {response.text}"
+    try:
+        generated_text = response.json()[0]['generated_text']
+    except (ValueError, KeyError, IndexError, TypeError):
+        return f"Error: unexpected response from Hugging Face API: {response.text}"
+    # When the reply echoes the prompt, strip it exactly. Searching for an
+    # "Answer: " marker alone can hit one that appears inside the retrieved
+    # chunks and return part of the prompt as the answer.
+    if generated_text.startswith(prompt):
+        return generated_text[len(prompt):].strip()
+    # Fallback: extract only the text after the first "Answer: " marker.
+    marker = "Answer: "
+    answer_start = generated_text.find(marker)
+    if answer_start != -1:
+        return generated_text[answer_start + len(marker):].strip()
+    return generated_text
+def chunk_text(text, chunk_size=1000, overlap=0):
+    """Split `text` into consecutive pieces of at most `chunk_size` characters.
+
+    Args:
+        text: string to split.
+        chunk_size: maximum length of each chunk; must be positive.
+        overlap: number of characters each chunk shares with the previous
+            one. The default 0 gives disjoint chunks, as before.
+
+    Returns:
+        List of chunks in order; empty when `text` is empty.
+
+    Raises:
+        ValueError: if `chunk_size` is not positive or `overlap` is not in
+            the range [0, chunk_size).
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if not 0 <= overlap < chunk_size:
+        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
+    step = chunk_size - overlap
+    chunks = []
+    for start in range(0, len(text), step):
+        chunks.append(text[start:start + chunk_size])
+        # Stop once the end is covered so overlapping mode does not emit a
+        # trailing chunk that is fully contained in the previous one.
+        if start + chunk_size >= len(text):
+            break
+    return chunks

youtube_processing.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from youtube_transcript_api import YouTubeTranscriptApi
+import re
+def _extract_video_id(video_url):
+    """Return the video id found in `video_url`, or None when there is none."""
+    from urllib.parse import parse_qs, urlparse
+    parsed = urlparse(video_url.strip())
+    host = (parsed.hostname or "").lower()
+    path_parts = [part for part in parsed.path.split("/") if part]
+    # Short links: https://youtu.be/<id>
+    if host == "youtu.be":
+        return path_parts[0] if path_parts else None
+    # Standard watch links: ...?v=<id>
+    query_ids = parse_qs(parsed.query).get("v")
+    if query_ids:
+        return query_ids[0]
+    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
+    if len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
+        return path_parts[1]
+    # Last resort: the original pattern, for inputs that are not well-formed URLs.
+    match = re.search(r"(?<=v=)[^&#]+", video_url)
+    return match.group(0) if match else None
+def extract_text_from_youtube(video_url):
+    """Return the transcript of the YouTube video at `video_url` as one string.
+
+    Watch (`?v=`), youtu.be, /embed/, /shorts/ and /live/ style links are
+    recognised. This is best-effort: "" is returned when no video id can be
+    found or when fetching the transcript fails (the error is printed).
+    """
+    video_id = _extract_video_id(video_url)
+    if not video_id:
+        return ""
+    try:
+        transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        return " ".join(item['text'] for item in transcript)
+    except Exception as e:
+        print(f"Error fetching transcript: {e}")
+        return ""