import os

import openai
import PyPDF2
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# Read the API key from the environment; the OpenAI SDK's conventional
# variable name is OPENAI_API_KEY.
openai.api_key = os.getenv("OPENAI_API_KEY")


# Cache the embedding model so Streamlit does not reload it on every
# rerun of the script.
@st.cache_resource
def get_model():
    return SentenceTransformer("all-MiniLM-L6-v2")


model = get_model()

def load_pdf(uploaded_file):
    """Extract the text of every page of the uploaded PDF."""
    # Streamlit's UploadedFile is file-like, so PyPDF2 can read it directly;
    # writing a temporary copy to disk is unnecessary.
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() returns None for pages without extractable text.
        text += page.extract_text() or ""
    return text

def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i + chunk_size])
    return chunks
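
# Worked example of the stride above: with the default chunk_size=500 and
# overlap=100, the loop advances 400 characters at a time, so chunks start
# at offsets 0, 400, 800, ... and each adjacent pair shares 100 characters.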

def create_embeddings(chunks):
    embeddings = model.encode(chunks, convert_to_tensor=True)
    return embeddings
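
# model.encode returns one vector per chunk; all-MiniLM-L6-v2 produces
# 384-dimensional embeddings, so this is a tensor of shape (len(chunks), 384).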

def find_relevant_chunks(query_embedding, chunk_embeddings, chunks, top_k=3):
    cosine_scores = util.pytorch_cos_sim(query_embedding, chunk_embeddings)[0]
    top_results = sorted(
        range(len(cosine_scores)), key=lambda i: cosine_scores[i], reverse=True
    )[:top_k]
    relevant_chunks = [chunks[i] for i in top_results]
    return relevant_chunks
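
# An equivalent selection, sketched with torch.topk (torch ships as a
# dependency of sentence-transformers, and util.cos_sim is the current
# name for util.pytorch_cos_sim):
#
#     import torch
#     scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
#     top_idx = torch.topk(scores, k=min(top_k, len(chunks))).indices.tolist()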

def generate_response(query, context):
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that answers questions based on provided context.",
        },
        {"role": "user", "content": f"Context: {context}\nQuestion: {query}"},
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200,
    )
    return response.choices[0].message.content.strip()
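
# Note: openai.chat.completions.create is the v1.x interface of the openai
# Python package; on the older 0.x SDK the equivalent call was
# openai.ChatCompletion.create with the same messages format.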

st.title("Simple RAG Application (No LangChain)")

uploaded_file = st.file_uploader("Upload PDF", type="pdf")

if uploaded_file:
    with st.spinner("Processing PDF..."):
        pdf_text = load_pdf(uploaded_file)
        chunks = chunk_text(pdf_text)
        chunk_embeddings = create_embeddings(chunks)
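
    # Streamlit reruns this whole script on every interaction, so the PDF is
    # re-parsed and re-embedded for each question. For larger documents,
    # consider stashing chunks and chunk_embeddings in st.session_state
    # (keyed by the uploaded file) to avoid the repeated work.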

    query = st.text_input("Ask a question:")

    if query:
        query_embedding = model.encode([query], convert_to_tensor=True)
        relevant_chunks = find_relevant_chunks(query_embedding, chunk_embeddings, chunks)
        context = "\n".join(relevant_chunks)
        answer = generate_response(query, context)
        st.write("Answer:", answer)
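
# To try the app locally (assuming this file is saved as app.py):
#
#     export OPENAI_API_KEY="sk-..."
#     streamlit run app.py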