| | import streamlit as st |
| | from PyPDF2 import PdfReader |
| | from transformers import AutoTokenizer, AutoModel |
| | import torch |
| | import faiss |
| | import numpy as np |
| | import os |
| | import requests |
| |
|
| | |
# Groq API key, read once at import time; None if the env var is unset
# (query_groq will then send "Bearer None" and the API returns 401).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# OpenAI-compatible chat-completions endpoint hosted by Groq.
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# HuggingFace checkpoint used for both document and query embeddings.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
| |
|
| | |
def read_pdf(file):
    """Extract the plain text of every page of a PDF.

    Args:
        file: a binary file-like object accepted by PyPDF2's ``PdfReader``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: the text of all pages, each followed by a newline. Pages with
        no extractable text (scanned/image-only) contribute an empty line.
    """
    pdf = PdfReader(file)
    # extract_text() can return None for image-only pages; the original
    # `text += page.extract_text() + "\n"` raised TypeError there.
    # join() also avoids quadratic string concatenation on large PDFs.
    return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
| |
|
| | |
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping, whitespace-tokenized word chunks.

    Args:
        text (str): source text; split on any whitespace.
        chunk_size (int): number of words per chunk; must be > 0.
        overlap (int): words shared between consecutive chunks; must
            satisfy 0 <= overlap < chunk_size so the window advances.

    Returns:
        list[str]: the chunks; [] for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size <= 0, or overlap is negative or
            >= chunk_size (the original code looped forever in that case,
            since the step ``chunk_size - overlap`` was <= 0).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
| |
|
| | |
@st.cache_resource
def load_embedding_model():
    """Load the HuggingFace tokenizer/encoder pair used for embeddings.

    Decorated with ``st.cache_resource`` so the checkpoint is instantiated
    once per Streamlit server process rather than on every script rerun.

    Returns:
        tuple: ``(tokenizer, model)`` for ``EMBED_MODEL_NAME``.
    """
    return (
        AutoTokenizer.from_pretrained(EMBED_MODEL_NAME),
        AutoModel.from_pretrained(EMBED_MODEL_NAME),
    )
| |
|
def get_embeddings(text_chunks, tokenizer, model):
    """Embed each text chunk with the encoder's first-position token vector.

    Args:
        text_chunks: iterable of strings to embed (one forward pass each).
        tokenizer: HuggingFace tokenizer producing PyTorch ("pt") tensors.
        model: HuggingFace encoder whose output exposes ``last_hidden_state``.

    Returns:
        np.ndarray of shape ``(len(text_chunks), hidden_dim)``.

    NOTE(review): this pools with the position-0 (CLS) vector; MiniLM
    sentence-transformers checkpoints are normally mean-pooled — confirm
    this matches the intended retrieval quality. It does match how
    the query is embedded elsewhere in this file, so changing one site
    requires changing both.
    """
    def _encode(text):
        # One chunk per forward pass; no gradients needed for inference.
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        # First token of the single-item batch -> 1-D (hidden_dim,) vector.
        return hidden[0, 0, :].numpy()

    return np.array([_encode(chunk) for chunk in text_chunks])
| |
|
| | |
def build_faiss_index(embeddings):
    """Build an exact L2 (Euclidean) FAISS index over the given vectors.

    Args:
        embeddings: np.ndarray of shape ``(n_chunks, dim)``.

    Returns:
        faiss.IndexFlatL2 populated with every row of ``embeddings``.
    """
    n_dims = embeddings.shape[1]
    idx = faiss.IndexFlatL2(n_dims)
    idx.add(embeddings)
    return idx
| |
|
def search_index(index, query, tokenizer, model, chunks, top_k=3):
    """Return the ``top_k`` chunks most similar to ``query``.

    Embeds the query with the encoder's position-0 token vector (the same
    pooling this file uses for document chunks), runs a nearest-neighbour
    search on the FAISS index, and maps the hit indices back to chunk text.

    Args:
        index: FAISS index built over the chunk embeddings.
        query (str): the user's question.
        tokenizer, model: the pair returned by the embedding loader.
        chunks (list[str]): chunk texts, in the order they were indexed.
        top_k (int): number of neighbours to return.

    Returns:
        list[str]: the ``top_k`` best-matching chunks, nearest first.
    """
    encoded = tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        query_vec = model(**encoded).last_hidden_state[:, 0, :].numpy()
    _, hits = index.search(query_vec, top_k)
    return [chunks[hit] for hit in hits[0]]
| |
|
| | |
def query_groq(context, question):
    """Ask the Groq chat-completions API a question grounded in ``context``.

    Args:
        context (str): retrieved document text used to ground the answer.
        question (str): the user's question.

    Returns:
        str: the assistant message content of the first choice.

    Raises:
        RuntimeError: if the GROQ_API_KEY environment variable is not set.
        requests.HTTPError: on a non-2xx API response.
        requests.Timeout: if the API does not respond within 60 seconds.
    """
    if not GROQ_API_KEY:
        # Fail fast with an actionable message instead of an opaque 401
        # ("Bearer None") coming back from the API.
        raise RuntimeError("GROQ_API_KEY environment variable is not set")
    prompt = f"""You are a helpful engineering assistant. Use the following context to answer the question.

Context:
{context}

Question:
{question}
"""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "system", "content": "You are a helpful engineering tutor."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,  # low temperature: keep answers close to the context
        "max_tokens": 512,
    }
    # A timeout prevents the Streamlit worker from hanging forever on a
    # dead connection (requests has no default timeout).
    response = requests.post(GROQ_API_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
| |
|
| | |
# --- Streamlit UI: page header and the two user inputs ---
st.title("📚 engGlass RAG Assistant")
st.write("Upload a PDF, ask engineering questions, and get smart answers!")

# Both widgets return None/"" until the user interacts, which gates the
# pipeline below.
uploaded_file = st.file_uploader("Upload PDF", type="pdf")
question = st.text_input("Ask a question based on the uploaded document:")
| |
|
# Main RAG pipeline: runs only once both a PDF and a question are provided.
if uploaded_file and question:
    with st.spinner("Reading and processing PDF..."):
        # NOTE(review): everything in this block re-executes on every
        # Streamlit rerun — the PDF is re-read, re-chunked and re-embedded
        # for each new question. Consider caching per uploaded file
        # (st.cache_data / st.session_state) — TODO confirm.
        text = read_pdf(uploaded_file)
        chunks = chunk_text(text)
        tokenizer, model = load_embedding_model()
        embeddings = get_embeddings(chunks, tokenizer, model)
        index = build_faiss_index(embeddings)
        # Retrieve the chunks most similar to the question to use as
        # grounding context for the LLM.
        top_chunks = search_index(index, question, tokenizer, model, chunks)
        context = "\n".join(top_chunks)

    with st.spinner("Generating answer from Groq..."):
        try:
            answer = query_groq(context, question)
            st.markdown("### 💡 Answer")
            st.write(answer)
        except Exception as e:
            # Surface API/network failures in the UI instead of crashing
            # the whole Streamlit script.
            st.error(f"Error: {str(e)}")
| |
|