import os
import tempfile
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import docx
import faiss
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np
# Load sentence transformer model
# Multilingual MiniLM so both Urdu queries and Urdu OCR text embed into the
# same vector space as English.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Initialize Groq client
# NOTE(review): raises KeyError at import time if GROQ_KEY is unset — confirm
# that is the intended fail-fast behavior for this deployment.
client = Groq(api_key=os.environ['GROQ_KEY'])
# OCR extraction from PDF
def extract_text_from_pdf(file_path):
    """OCR a PDF and return its text as one string.

    Each page is rendered to an image and run through Tesseract with the
    Urdu language pack; pages are separated by a trailing newline.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated OCR text of all pages.
    """
    # Render pages to PIL images. Make sure the Poppler binaries live here.
    images = convert_from_path(file_path, poppler_path='/usr/bin')
    # join() instead of repeated `text +=` — the loop concat is quadratic.
    return "".join(
        pytesseract.image_to_string(image, lang='urd') + "\n"
        for image in images
    )
# Text extraction from Word documents
def extract_text_from_docx(file_path):
    """Return the full text of a Word document, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
# Chunking text
def chunk_text(text, max_length=500):
    """Greedily pack newline-separated sentences into chunks.

    Sentences are joined with single spaces until adding the next one would
    push the chunk past *max_length* characters. A single sentence longer
    than *max_length* still becomes its own (oversized) chunk.

    Args:
        text: Source text; sentences are assumed to be newline-separated.
        max_length: Soft cap on chunk length in characters.

    Returns:
        List of non-empty, stripped chunk strings. Empty input yields [].
    """
    chunks = []
    current_chunk = ""
    for sentence in text.split("\n"):
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
        else:
            # Bug fix: guard against flushing an empty chunk when the very
            # first sentence already exceeds max_length.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    # Flush the remainder; skip whitespace-only tails so "" -> [] not [""].
    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
# Generate embeddings
def generate_embeddings(chunks):
return model.encode(chunks)
# Store in FAISS index
def create_faiss_index(embeddings):
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
return index
# Search in FAISS
def search_index(index, query, chunks, k=3):
query_embedding = model.encode([query])
distances, indices = index.search(query_embedding, k)
return [chunks[i] for i in indices[0]]
# Groq query function with Urdu system prompt
def query_groq(query, context):
messages = [
{
"role": "system",
"content": (
"آپ ایک مددگار اسسٹنٹ ہیں جو ہمیشہ اردو میں جواب دیتا ہے، چاہے سوال اردو یا انگریزی میں ہو۔ "
"براہ کرم نیچے دیے گئے سیاق و سباق اور سوال کی بنیاد پر اردو میں تفصیلی جواب دیں۔"
),
},
{
"role": "user",
"content": f"سیاق و سباق:\n{context}\n\nسوال:\n{query}",
},
]
chat_completion = client.chat.completions.create(
messages=messages,
model="llama3-8b-8192",
)
return chat_completion.choices[0].message.content
# Streamlit UI
st.title("📚 اردو ڈاکیومنٹس کے لیے RAG ایپ")
uploaded_file = st.file_uploader("پی ڈی ایف یا ورڈ فائل اپلوڈ کریں", type=["pdf", "docx"])
if uploaded_file:
    # The extractors need a real file path, so persist the upload to a
    # temp file first (delete=False because it is reopened by path below).
    suffix = "." + uploaded_file.name.split('.')[-1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name
    try:
        if uploaded_file.name.endswith(".pdf"):
            text = extract_text_from_pdf(tmp_path)
        else:
            text = extract_text_from_docx(tmp_path)
    finally:
        # Bug fix: delete=False leaked one temp file per upload — remove it
        # once extraction is done (even if extraction raised).
        os.remove(tmp_path)
    # NOTE(review): Streamlit reruns this whole script on every interaction,
    # so the document is re-OCR'd and re-embedded per question — consider
    # st.cache_data / st.session_state if that becomes slow.
    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(np.array(embeddings))
    st.success("ڈاکیومنٹ پروسیس ہو گیا ہے۔ اب سوال پوچھیں۔")
    query = st.text_input("سوال درج کریں")
    if query:
        top_chunks = search_index(index, query, chunks)
        context = "\n".join(top_chunks)
        answer = query_groq(query, context)
        st.markdown("### جواب:")
        st.write(answer)
|