Spaces:

FridayMaster
/

CHATBOT1

Sleeping

App Files Files Community

FridayMaster committed on Aug 11, 2024

Commit

929a283

verified ·

1 Parent(s): aca97ad

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -70

app.py CHANGED Viewed

@@ -1,78 +1,106 @@
-import os
-import pandas as pd
-import PyPDF2
-import spacy
 import faiss
 from sentence_transformers import SentenceTransformer
-import torch
-import gradio as gr
-# Load and preprocess PDF text
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    with open(pdf_path, 'rb') as pdf_file:
-        pdf_reader = PyPDF2.PdfReader(pdf_file)
-        for page_num in range(len(pdf_reader.pages)):
-            page = pdf_reader.pages[page_num]
-            text += page.extract_text()
-    return text
-# Path to your PDF file
-pdf_path = 'FridayMaster/UBANTUMANUAL/Getting Started with Ubuntu 16.04.pdf'
-# Extract text from the PDF
-pdf_text = extract_text_from_pdf(pdf_path)
-# Convert the text to a DataFrame
-df = pd.DataFrame({'text': [pdf_text]})
-# Load the custom embedding model
-class CustomEmbeddingModel:
-    def __init__(self, model_name):
-        self.model = SentenceTransformer(model_name)
-    def embed_text(self, text):
-        return self.model.encode(text, convert_to_tensor=True)
-embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
-# Load Spacy model for preprocessing
-nlp = spacy.load("en_core_web_sm")
-def preprocess_text(text):
-    doc = nlp(text)
-    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]
-    return ' '.join(tokens)
-# Apply preprocessing and embedding
-df['text'] = df['text'].apply(preprocess_text)
-df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
-# Create a FAISS index
-index = faiss.IndexFlatL2(768)  # Assuming embeddings are 768-dimensional
-embeddings = torch.stack(df['text_embeddings'].tolist())
-faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
-faiss_index.add(embeddings.numpy())
-# Function to generate a response
-def generate_response(prompt):
-    query_embedding = embedding_model.embed_text(prompt).unsqueeze(0)
-    distances, indices = faiss_index.search(query_embedding.numpy(), k=1)
-    response = df.iloc[indices[0][0]]['text']
-    return response
-# Gradio interface
 iface = gr.Interface(
-    fn=generate_response,
-    inputs=gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu..."),
-    outputs=gr.Textbox(label="Response"),
-    title="Ubuntu Manual Chatbot",
-    description="Ask questions about the Ubuntu manual."
 )
 if __name__ == "__main__":
     iface.launch()

+import gradio as gr
 import faiss
+import numpy as np
+import openai
 from sentence_transformers import SentenceTransformer
+from nltk.tokenize import sent_tokenize
+# Load the Ubuntu manual from a .txt file.
+# The whole file is read into memory once, at import time.
+# NOTE(review): "/content/..." looks like a notebook-style absolute path --
+# confirm the file actually exists at this location where the app is deployed.
+with open("/content/ubuntu_manual.txt", "r", encoding="utf-8") as file:
+    full_text = file.read()
+# Function to chunk the text into smaller pieces
+def chunk_text(text, chunk_size=500, sentence_splitter=None):
+    """Split ``text`` into chunks of whole sentences of at most ``chunk_size`` words.
+
+    Sentences are never split. A single sentence longer than ``chunk_size``
+    words therefore becomes a chunk on its own (exceeding the limit) rather
+    than being dropped or cut.
+
+    Args:
+        text: The document to split.
+        chunk_size: Maximum number of whitespace-separated words per chunk.
+        sentence_splitter: Optional callable mapping a string to a list of
+            sentences. Defaults to ``sent_tokenize``.
+
+    Returns:
+        A list of chunk strings, each made of consecutive sentences joined by
+        a single space. Empty input yields an empty list.
+    """
+    if sentence_splitter is None:
+        sentence_splitter = sent_tokenize
+    chunks = []
+    current_chunk = []
+    current_words = 0  # running word count of current_chunk
+    for sentence in sentence_splitter(text):
+        sentence_words = len(sentence.split())
+        # Compare word counts (not the number of sentences collected so far),
+        # and only flush a non-empty chunk so an oversized first sentence
+        # cannot emit an empty string.
+        if current_chunk and current_words + sentence_words > chunk_size:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = []
+            current_words = 0
+        current_chunk.append(sentence)
+        current_words += sentence_words
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+    return chunks
+# Apply chunking to the entire text
+manual_chunks = chunk_text(full_text, chunk_size=500)
+# Load your FAISS index
+# NOTE(review): retrieve_chunks uses the ids returned by this index as
+# positions in manual_chunks, so the index must have been built from the same
+# chunks in the same order -- confirm against how the .bin file was produced.
+index = faiss.read_index("path/to/your/faiss_index.bin")
+# Load your embedding model
+# NOTE(review): placeholder name; presumably it must be the model the index
+# was built with -- confirm.
+embedding_model = SentenceTransformer('your_embedding_model_name')
+# OpenAI API key
+# NOTE(review): placeholder value; avoid committing a real key to the repo.
+openai.api_key = 'your-openai-api-key'
+# Function to create embeddings
+def embed_text(text_list):
+    """Encode ``text_list`` with ``embedding_model`` and return a float32 NumPy array."""
+    vectors = embedding_model.encode(text_list)
+    return np.array(vectors, dtype=np.float32)
+# Function to retrieve relevant chunks for a user query
+def retrieve_chunks(query, k=5):
+    """Return the manual chunks whose index entries are nearest to ``query``.
+
+    Args:
+        query: The user's question as a string.
+        k: Number of neighbours to request from the FAISS index.
+
+    Returns:
+        A list of up to ``k`` strings taken from ``manual_chunks``, in the
+        order the index returned them; an empty list when no returned id maps
+        to a chunk.
+    """
+    query_embedding = embed_text([query])
+    # Search the FAISS index
+    distances, indices = index.search(query_embedding, k=k)
+    # Debugging: Print out the distances and indices
+    print("Distances:", distances)
+    print("Indices:", indices)
+    # FAISS fills missing results with the label -1. A bare upper-bound check
+    # would let -1 through and silently return the *last* chunk, so require a
+    # non-negative id as well.
+    return [manual_chunks[i] for i in indices[0] if 0 <= i < len(manual_chunks)]
+# Function to truncate long inputs
+def truncate_input(text, max_length=512):
+    """Encode ``text`` with ``generator_tokenizer``, truncated to ``max_length``.
+
+    Returns the result of ``generator_tokenizer.encode`` called with
+    ``return_tensors="pt"``.
+
+    NOTE(review): ``generator_tokenizer`` is not defined in the visible part of
+    this file -- confirm it is created before this function is called.
+    """
+    return generator_tokenizer.encode(
+        text,
+        truncation=True,
+        max_length=max_length,
+        return_tensors="pt",
+    )
+# Function to perform RAG: Retrieve chunks and generate a response
+def rag_response(query, k=5, max_new_tokens=150):
+    """Answer ``query`` by retrieving manual chunks and generating text from them.
+
+    Args:
+        query: The user's question.
+        k: Number of chunks to request from ``retrieve_chunks``.
+        max_new_tokens: Passed through to ``generator_model.generate``.
+
+    Returns:
+        The decoded generation, or a fixed apology string when no chunk was
+        retrieved.
+
+    NOTE(review): ``generator_model`` and ``generator_tokenizer`` are not
+    defined in the visible part of this file -- confirm they exist at runtime.
+    """
+    context = retrieve_chunks(query, k=k)
+    if not context:
+        return "Sorry, I couldn't find relevant information."
+    # Prompt layout: the query first, then one retrieved chunk per line.
+    prompt = "\n".join([query, *context])
+    encoded = truncate_input(prompt)
+    generated = generator_model.generate(encoded, max_new_tokens=max_new_tokens)
+    return generator_tokenizer.decode(generated[0], skip_special_tokens=True)
+# Gradio Interface
+# Single text box in, single text box out; each submission calls rag_response.
+# NOTE(review): the title mentions GPT-3.5, but rag_response calls
+# generator_model.generate rather than the openai module -- confirm which
+# backend is intended.
 iface = gr.Interface(
+    fn=rag_response,
+    inputs="text",
+    outputs="text",
+    title="RAG Chatbot with FAISS and GPT-3.5",
+    description="Ask me anything!"
 )
 if __name__ == "__main__":
     iface.launch()