import os
import tempfile

import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import docx
import faiss
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np

# Multilingual sentence-embedding model (supports Urdu among ~50 languages).
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Groq client. A missing GROQ_KEY raises KeyError at startup, which is
# preferable to failing later on the first user query.
client = Groq(api_key=os.environ['GROQ_KEY'])


def extract_text_from_pdf(file_path):
    """OCR every page of a PDF into one text string.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        Concatenated OCR output, one trailing newline per page.
    """
    # NOTE(review): poppler_path='/usr/bin' is deployment-specific — confirm
    # it matches where the Poppler binaries (pdftoppm) are installed.
    images = convert_from_path(file_path, poppler_path='/usr/bin')
    # lang='urd' requires Tesseract's Urdu traineddata to be installed.
    return "".join(
        pytesseract.image_to_string(image, lang='urd') + "\n" for image in images
    )


def extract_text_from_docx(file_path):
    """Return the plain text of a .docx file, paragraphs joined by newlines."""
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def chunk_text(text, max_length=500):
    """Greedily pack newline-separated sentences into chunks.

    A chunk stops growing once adding the next sentence would exceed
    max_length characters; a single sentence longer than max_length
    becomes its own chunk.

    Args:
        text: Raw document text; newline is treated as the sentence boundary.
        max_length: Soft character budget per chunk.

    Returns:
        List of non-empty, stripped chunk strings ([] for empty input).
    """
    chunks = []
    current = ""
    for sentence in text.split("\n"):
        if len(current) + len(sentence) <= max_length:
            current += sentence + " "
        else:
            # Guard: never emit an empty chunk (e.g. when the very first
            # sentence alone exceeds max_length).
            if current.strip():
                chunks.append(current.strip())
            current = sentence + " "
    if current.strip():
        chunks.append(current.strip())
    return chunks


def generate_embeddings(chunks):
    """Embed each chunk; returns an array of shape (len(chunks), dim)."""
    return model.encode(chunks)


def create_faiss_index(embeddings):
    """Build an exact L2 (flat) FAISS index over an embedding matrix.

    Args:
        embeddings: Array-like of shape (n, dim).

    Returns:
        A populated faiss.IndexFlatL2.
    """
    # FAISS requires a C-contiguous float32 matrix; convert explicitly
    # instead of relying on the encoder's default dtype.
    matrix = np.ascontiguousarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index


def search_index(index, query, chunks, k=3):
    """Return up to k chunks most similar to the query (L2 distance).

    Clamps k to the number of indexed chunks and drops FAISS's -1
    padding for missing neighbours, which would otherwise wrongly map
    to chunks[-1].
    """
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype="float32")
    k = min(k, len(chunks))
    _, indices = index.search(query_embedding, k)
    return [chunks[i] for i in indices[0] if i >= 0]


def query_groq(query, context):
    """Ask the Groq LLM to answer the query in Urdu using the given context.

    Args:
        query: The user's question (Urdu or English).
        context: Retrieved document chunks, newline-joined.

    Returns:
        The assistant's reply text.
    """
    messages = [
        {
            "role": "system",
            # System prompt (Urdu): always answer in Urdu, based on the
            # supplied context and question.
            "content": (
                "آپ ایک مددگار اسسٹنٹ ہیں جو ہمیشہ اردو میں جواب دیتا ہے، چاہے سوال اردو یا انگریزی میں ہو۔ "
                "براہ کرم نیچے دیے گئے سیاق و سباق اور سوال کی بنیاد پر اردو میں تفصیلی جواب دیں۔"
            ),
        },
        {
            "role": "user",
            "content": f"سیاق و سباق:\n{context}\n\nسوال:\n{query}",
        },
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content


# --------------------------------------------------------------- Streamlit UI
st.title("📚 اردو ڈاکیومنٹس کے لیے RAG ایپ")

uploaded_file = st.file_uploader("پی ڈی ایف یا ورڈ فائل اپلوڈ کریں", type=["pdf", "docx"])

if uploaded_file:
    # Persist the upload to disk because the extractors need a file path.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name

    try:
        if suffix.lower() == ".pdf":
            text = extract_text_from_pdf(tmp_path)
        else:
            text = extract_text_from_docx(tmp_path)
    finally:
        # delete=False lets the extractors reopen the file by path; remove it
        # ourselves so repeated uploads don't leak temp files.
        os.unlink(tmp_path)

    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(embeddings)

    st.success("ڈاکیومنٹ پروسیس ہو گیا ہے۔ اب سوال پوچھیں۔")
    query = st.text_input("سوال درج کریں")

    if query:
        top_chunks = search_index(index, query, chunks)
        context = "\n".join(top_chunks)
        answer = query_groq(query, context)
        st.markdown("### جواب:")
        st.write(answer)