"""Rag_App_Doc / app.py — Streamlit RAG app for Urdu PDF/Word documents."""
import os
import tempfile
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import docx
import faiss
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np
# Load sentence transformer model
# Multilingual MiniLM encoder (supports Urdu); used for both chunk and query embeddings.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Initialize Groq client
# Requires GROQ_KEY in the environment; raises KeyError at import time if it is missing.
client = Groq(api_key=os.environ['GROQ_KEY'])
# OCR extraction from PDF
def extract_text_from_pdf(file_path):
    """Rasterize each PDF page and OCR it as Urdu; return the combined text."""
    pages = convert_from_path(file_path, poppler_path='/usr/bin')  # Make sure Poppler path is correct
    page_texts = [pytesseract.image_to_string(page, lang='urd') for page in pages]
    # Each page's text is followed by a newline, matching per-page concatenation.
    return "".join(page_text + "\n" for page_text in page_texts)
# Text extraction from Word documents
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
# Chunking text
def chunk_text(text, max_length=500):
    """Split *text* into chunks of roughly *max_length* characters.

    Lines (split on newline) are greedily packed into the current chunk
    while the accumulated length stays within ``max_length``; a single
    line longer than ``max_length`` becomes its own (oversized) chunk.

    Returns a list of non-empty, stripped chunk strings; empty input
    yields an empty list.
    """
    sentences = text.split("\n")
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
        else:
            # Bug fix: the original appended current_chunk unconditionally,
            # emitting an empty chunk when the very first sentence already
            # exceeded max_length. Only flush a non-blank accumulator.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Generate embeddings
def generate_embeddings(chunks):
    """Encode the text chunks into dense vectors with the module-level model."""
    vectors = model.encode(chunks)
    return vectors
# Store in FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given (n, dim) embedding matrix."""
    vector_dim = embeddings.shape[1]
    l2_index = faiss.IndexFlatL2(vector_dim)
    l2_index.add(embeddings)
    return l2_index
# Search in FAISS
def search_index(index, query, chunks, k=3):
    """Return up to *k* chunks most similar to *query*.

    Encodes the query with the module-level sentence-transformer model and
    runs a nearest-neighbour search on the FAISS index.
    """
    query_embedding = model.encode([query])
    _distances, indices = index.search(query_embedding, k)
    # Bug fix: FAISS pads the result with -1 when fewer than k vectors are
    # indexed; the original `chunks[i]` then silently returned chunks[-1].
    # Keep only valid positions.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
# Groq query function with Urdu system prompt
def query_groq(query, context):
    """Ask the Groq LLM to answer *query* in Urdu, grounded in *context*."""
    system_prompt = (
        "آپ ایک مددگار اسسٹنٹ ہیں جو ہمیشہ اردو میں جواب دیتا ہے، چاہے سوال اردو یا انگریزی میں ہو۔ "
        "براہ کرم نیچے دیے گئے سیاق و سباق اور سوال کی بنیاد پر اردو میں تفصیلی جواب دیں۔"
    )
    user_prompt = f"سیاق و سباق:\n{context}\n\nسوال:\n{query}"
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# Streamlit UI
st.title("📚 اردو ڈاکیومنٹس کے لیے RAG ایپ")
uploaded_file = st.file_uploader("پی ڈی ایف یا ورڈ فائل اپلوڈ کریں", type=["pdf", "docx"])
if uploaded_file:
    # Persist the upload to disk because pdf2image/python-docx need a real path.
    suffix = "." + uploaded_file.name.split('.')[-1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name
    try:
        # Bug fix: case-insensitive extension check, so "X.PDF" is not
        # routed to the docx parser (which would crash on a PDF).
        if uploaded_file.name.lower().endswith(".pdf"):
            text = extract_text_from_pdf(tmp_path)
        else:
            text = extract_text_from_docx(tmp_path)
    finally:
        # Bug fix: NamedTemporaryFile(delete=False) leaked a file per upload
        # (and per Streamlit rerun); remove it once extraction is done.
        os.remove(tmp_path)
    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(np.array(embeddings))
    st.success("ڈاکیومنٹ پروسیس ہو گیا ہے۔ اب سوال پوچھیں۔")
    query = st.text_input("سوال درج کریں")
    if query:
        top_chunks = search_index(index, query, chunks)
        context = "\n".join(top_chunks)
        answer = query_groq(query, context)
        st.markdown("### جواب:")
        st.write(answer)