# Urdu document RAG app (Hugging Face Space) — OCR, chunk, embed, and answer in Urdu.
| import os | |
| import tempfile | |
| import streamlit as st | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
| import docx | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from groq import Groq | |
| import numpy as np | |
# Shared multilingual embedding model, used for both document chunks and queries.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# Groq API client; requires the GROQ_KEY environment variable to be set
# (raises KeyError at import time if it is missing).
client = Groq(api_key=os.environ['GROQ_KEY'])
def extract_text_from_pdf(file_path, poppler_path='/usr/bin', lang='urd'):
    """OCR every page of a PDF and return the concatenated text.

    Args:
        file_path: Path to the PDF file on disk.
        poppler_path: Directory containing the Poppler binaries used by
            pdf2image. Default keeps the previously hard-coded location,
            so existing callers are unaffected.
        lang: Tesseract language code (default 'urd' for Urdu).

    Returns:
        Recognized text of all pages, each page's text followed by a
        newline (empty string for a zero-page document).
    """
    # Render each PDF page to a PIL image, then OCR page by page.
    pages = convert_from_path(file_path, poppler_path=poppler_path)
    text = ""
    for page in pages:
        text += pytesseract.image_to_string(page, lang=lang) + "\n"
    return text
def extract_text_from_docx(file_path):
    """Read a .docx file and return its paragraph texts joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def chunk_text(text, max_length=500):
    """Greedily pack newline-separated sentences into character-bounded chunks.

    Sentences (lines of *text*) are appended, space-separated, to the
    current chunk until adding the next one would exceed *max_length*
    characters; the chunk is then flushed and a new one started. A single
    sentence longer than *max_length* becomes its own oversized chunk.

    Args:
        text: Raw text; split on "\\n" into "sentences".
        max_length: Soft cap on chunk length in characters.

    Returns:
        List of non-empty, stripped chunk strings ([] for empty input).
    """
    chunks = []
    current = ""
    for sentence in text.split("\n"):
        if len(current) + len(sentence) <= max_length:
            current += sentence + " "
        else:
            # Fix: only flush non-empty chunks. The original appended ""
            # whenever the very first sentence already exceeded max_length.
            if current.strip():
                chunks.append(current.strip())
            current = sentence + " "
    # Fix: empty input used to yield [""], now yields [].
    if current.strip():
        chunks.append(current.strip())
    return chunks
def generate_embeddings(chunks):
    """Encode text chunks into dense vectors via the shared sentence model."""
    vectors = model.encode(chunks)
    return vectors
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over an (n, dim) embedding matrix."""
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings)
    return faiss_index
def search_index(index, query, chunks, k=3):
    """Return the k chunks whose embeddings are nearest (L2) to the query."""
    query_vector = model.encode([query])
    _, neighbor_ids = index.search(query_vector, k)
    return [chunks[neighbor] for neighbor in neighbor_ids[0]]
def query_groq(query, context):
    """Ask the Groq LLM to answer *query* in Urdu, grounded in *context*.

    The system prompt (in Urdu) instructs the model to always reply in
    Urdu, regardless of the question's language, using the supplied context.
    """
    system_prompt = (
        "آپ ایک مددگار اسسٹنٹ ہیں جو ہمیشہ اردو میں جواب دیتا ہے، چاہے سوال اردو یا انگریزی میں ہو۔ "
        "براہ کرم نیچے دیے گئے سیاق و سباق اور سوال کی بنیاد پر اردو میں تفصیلی جواب دیں۔"
    )
    user_prompt = f"سیاق و سباق:\n{context}\n\nسوال:\n{query}"
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# --- Streamlit UI -----------------------------------------------------------
st.title("📚 اردو ڈاکیومنٹس کے لیے RAG ایپ")

uploaded_file = st.file_uploader("پی ڈی ایف یا ورڈ فائل اپلوڈ کریں", type=["pdf", "docx"])

if uploaded_file:
    # Streamlit re-runs this whole script on every interaction. Guard the
    # expensive OCR + embedding pipeline so it runs only when a *new* file
    # is uploaded, instead of once per query keystroke.
    if st.session_state.get("processed_file") != uploaded_file.name:
        suffix = "." + uploaded_file.name.rsplit(".", 1)[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.read())
            tmp_path = tmp.name
        try:
            # Case-insensitive extension check (original missed ".PDF").
            if uploaded_file.name.lower().endswith(".pdf"):
                text = extract_text_from_pdf(tmp_path)
            else:
                text = extract_text_from_docx(tmp_path)
        finally:
            # Fix: the temp file was created with delete=False and never
            # removed — clean it up even if extraction raises.
            os.unlink(tmp_path)
        chunks = chunk_text(text)
        embeddings = generate_embeddings(chunks)
        st.session_state["chunks"] = chunks
        st.session_state["index"] = create_faiss_index(np.array(embeddings))
        st.session_state["processed_file"] = uploaded_file.name

    st.success("ڈاکیومنٹ پروسیس ہو گیا ہے۔ اب سوال پوچھیں۔")
    query = st.text_input("سوال درج کریں")
    if query:
        top_chunks = search_index(
            st.session_state["index"], query, st.session_state["chunks"]
        )
        context = "\n".join(top_chunks)
        answer = query_groq(query, context)
        st.markdown("### جواب:")
        st.write(answer)