|
|
import os |
|
|
import gradio as gr |
|
|
import faiss |
|
|
import numpy as np |
|
|
import PyPDF2 |
|
|
from sentence_transformers import SentenceTransformer |
|
|
import requests |
|
|
|
|
|
|
|
|
# --- Configuration & shared state ------------------------------------------

# Groq API key is read from the environment so it is never hard-coded;
# empty string if unset (requests will then fail with 401 at call time).
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

# Chat-completion model served by Groq's OpenAI-compatible endpoint.
GROQ_MODEL = "llama3-8b-8192"


# Sentence-embedding model used for both document chunks and user queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


# all-MiniLM-L6-v2 emits 384-dimensional vectors.
dimension = 384

# Exact (flat) L2 index; one vector per stored text chunk.
index = faiss.IndexFlatL2(dimension)

# Chunks in insertion order; list positions parallel vector ids in `index`.
text_chunks = []
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file* (a path or file object)."""
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() can return None for image-only pages; substitute "".
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
|
|
def chunk_text(text, chunk_size=500):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* characters.

    The text is split on ". " boundaries and sentences are greedily packed into
    chunks; a single sentence longer than chunk_size becomes its own chunk.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on characters per chunk.

    Returns:
        List of stripped chunk strings; [] for empty input (the previous
        version returned ["."] for "" and could emit an empty "" chunk when
        the first sentence exceeded chunk_size).
    """
    chunks, chunk = [], ""
    for sentence in text.split('. '):
        if not sentence:
            # Skip empties from empty input or consecutive delimiters.
            continue
        if len(chunk) + len(sentence) < chunk_size:
            chunk += sentence + ". "
        else:
            if chunk:  # don't emit an empty chunk before an oversized sentence
                chunks.append(chunk.strip())
            chunk = sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks
|
|
|
|
|
def embed_and_store(chunks):
    """Embed *chunks* and rebuild the module-level FAISS index over them.

    Replaces the global `text_chunks` and resets `index` before adding, so
    processing a second document does not leave stale vectors behind. (The
    previous version only appended: after a re-upload, vector ids beyond
    len(text_chunks) no longer mapped to any chunk, corrupting retrieval.)

    Args:
        chunks: List of text chunks to index.
    """
    global text_chunks
    text_chunks = chunks
    index.reset()  # drop vectors from any previously processed document
    embeddings = embedding_model.encode(chunks)
    index.add(np.array(embeddings))
|
|
|
|
|
def retrieve_context(query, top_k=3):
    """Return the *top_k* stored chunks most similar to *query*, newline-joined.

    FAISS pads its result with id -1 when the index holds fewer than top_k
    vectors; those placeholders are filtered out here instead of silently
    indexing text_chunks[-1] (which returned an arbitrary wrong chunk, or
    raised IndexError on an empty list).

    Args:
        query: Natural-language question to embed and search with.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The matching chunks joined with newlines ("" if nothing is indexed).
    """
    query_vector = embedding_model.encode([query])
    _, indices = index.search(np.array(query_vector), top_k)
    hits = [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]
    return "\n".join(hits)
|
|
|
|
|
def format_prompt(context, question):
    """Build the two-message (system + user) chat payload for the Groq API."""
    return [
        {
            "role": "system",
            "content": "You are a helpful research assistant who answers questions using only the uploaded document.",
        },
        {
            "role": "user",
            "content": f"Document Context:\n{context}\n\nQuestion: {question}\nAnswer:",
        },
    ]
|
|
|
|
|
def call_groq_api(messages):
    """POST *messages* to Groq's OpenAI-compatible chat endpoint and return the reply text.

    Args:
        messages: Chat messages as a list of {"role", "content"} dicts.

    Returns:
        The assistant's reply string.

    Raises:
        requests.HTTPError: on a non-2xx response (previously an API error
            surfaced as an opaque KeyError on the JSON body).
        requests.Timeout: if the service does not respond within 60s
            (previously the request could block forever — no timeout was set).
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": GROQ_MODEL,
        "messages": messages,
        "temperature": 0.3
    }
    response = requests.post(url, headers=headers, json=data, timeout=60)
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']
|
|
|
|
|
def upload_file(pdf):
    """Gradio handler: extract, chunk, and index the uploaded PDF.

    Args:
        pdf: The uploaded file object from gr.File.

    Returns:
        A status string shown in the Status textbox.
    """
    text = extract_text_from_pdf(pdf)
    chunks = chunk_text(text)
    embed_and_store(chunks)
    # The original return literal was split across two source lines with a
    # mojibake "β" marker (a broken/mis-encoded checkmark) — restored as a
    # single valid string literal.
    return "✅ Document processed. You may now ask questions."
|
|
|
|
|
def answer_question(question):
    """Gradio handler: answer *question* using only the indexed document.

    Returns an error marker string if no document has been processed yet;
    otherwise retrieves context, builds the prompt, and queries Groq.
    """
    # Guard clause: nothing has been uploaded/indexed yet.
    if not text_chunks:
        return "β Please upload and process a document first."
    context = retrieve_context(question)
    return call_groq_api(format_prompt(context, question))
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
# Two-step workflow: (1) upload & process a PDF, (2) ask questions against it.
# NOTE(review): the Markdown headings below contain mojibake characters
# ("π", "β") — presumably mis-encoded emoji from the original source; they
# are runtime strings and are left untouched here.
with gr.Blocks() as rag_ui:
    gr.Markdown("## π RAG Assistant with LLaMA3 (Groq)")

    # Document-ingestion controls.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        upload_button = gr.Button("Process Document")

    status_output = gr.Textbox(label="Status")
    # Extract → chunk → embed the uploaded file, reporting into the status box.
    upload_button.click(upload_file, inputs=pdf_input, outputs=status_output)

    # Question/answer controls.
    gr.Markdown("### β Ask a Question from the Uploaded PDF")
    question_input = gr.Textbox(label="Your Question")
    answer_output = gr.Textbox(label="Answer", lines=5)

    ask_button = gr.Button("Get Answer")
    # Retrieve context and query Groq; the reply lands in the answer box.
    ask_button.click(answer_question, inputs=question_input, outputs=answer_output)


# Start the web server (blocking call).
rag_ui.launch()
|
|
|