File size: 3,718 Bytes
6179096
67960d3
6179096
cb969f7
 
67960d3
 
 
6179096
 
67960d3
6179096
67960d3
 
6179096
67960d3
cb969f7
6179096
67960d3
cb969f7
67960d3
cb969f7
67960d3
 
cb969f7
6179096
67960d3
cb969f7
67960d3
cb969f7
 
67960d3
 
 
6179096
67960d3
 
 
 
 
 
 
 
 
6179096
 
67960d3
 
 
6179096
67960d3
6179096
cb969f7
 
6179096
 
 
67960d3
 
 
 
 
 
 
6179096
 
67960d3
 
 
 
 
 
 
 
 
 
 
6179096
67960d3
6179096
 
 
 
 
 
 
67960d3
 
6179096
 
67960d3
6179096
cb969f7
 
 
 
 
67960d3
cb969f7
6179096
67960d3
6179096
 
67960d3
 
 
6179096
67960d3
 
cb969f7
67960d3
cb969f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import tempfile
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import docx
import faiss
from sentence_transformers import SentenceTransformer
from groq import Groq
import numpy as np

# Load sentence transformer model
# Multilingual MiniLM chosen so Urdu chunks and queries (and English queries)
# embed into the same vector space for retrieval.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Initialize Groq client
# NOTE(review): os.environ['GROQ_KEY'] raises KeyError at import time when the
# variable is unset — confirm the deployment always provides it.
client = Groq(api_key=os.environ['GROQ_KEY'])

# OCR extraction from PDF
def extract_text_from_pdf(file_path, poppler_path='/usr/bin', lang='urd'):
    """Extract text from a (possibly scanned) PDF via OCR.

    Renders every page to an image with pdf2image/Poppler, then runs
    Tesseract OCR on each page.

    Args:
        file_path: Path to the PDF file.
        poppler_path: Directory holding the Poppler binaries. Parameterized
            (default preserves the previous hard-coded '/usr/bin') so other
            deployments can override it without editing this function.
        lang: Tesseract language code; defaults to 'urd' (Urdu).

    Returns:
        OCR text of all pages concatenated, each page followed by a newline.
        An empty/zero-page PDF yields "".
    """
    images = convert_from_path(file_path, poppler_path=poppler_path)
    # Build the result with join instead of repeated += (linear, not quadratic).
    return "".join(
        pytesseract.image_to_string(image, lang=lang) + "\n" for image in images
    )

# Text extraction from Word documents
def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# Chunking text
def chunk_text(text, max_length=500):
    """Greedily pack newline-separated lines into chunks of ~max_length chars.

    A line longer than max_length becomes its own chunk rather than being cut.

    Args:
        text: Raw document text.
        max_length: Soft upper bound on chunk size, in characters.

    Returns:
        A list of non-empty, whitespace-trimmed chunk strings. Fixes two
        defects of the previous version: empty input no longer yields [''],
        and an over-long first line no longer emits a leading empty chunk.
    """
    chunks = []
    current = ""
    for line in text.split("\n"):
        if len(current) + len(line) <= max_length:
            current += line + " "
        else:
            if current.strip():  # skip the empty chunk an over-long line used to create
                chunks.append(current.strip())
            current = line + " "
    if current.strip():  # avoid a trailing empty chunk on empty/whitespace input
        chunks.append(current.strip())
    return chunks

# Generate embeddings
def generate_embeddings(chunks):
    """Encode the chunk texts into dense vectors with the shared sentence model."""
    vectors = model.encode(chunks)
    return vectors

# Store in FAISS index
def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index and load the embedding matrix into it.

    The embedding dimensionality is taken from the matrix's second axis.
    """
    vector_dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(vector_dim)
    faiss_index.add(embeddings)
    return faiss_index

# Search in FAISS
def search_index(index, query, chunks, k=3):
    """Return the up-to-k chunks most similar to the query, nearest first.

    Args:
        index: FAISS index built over the chunk embeddings.
        query: The user's question text.
        chunks: Chunk strings, positionally aligned with the index vectors.
        k: Number of neighbours to request.

    Returns:
        A list of at most k chunk strings.
    """
    query_embedding = model.encode([query])
    _, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when k exceeds the number of stored
    # vectors; chunks[-1] would then silently return the LAST chunk, so
    # drop the padding entries.
    return [chunks[i] for i in indices[0] if i >= 0]

# Groq query function with Urdu system prompt
def query_groq(query, context):
    """Ask the Groq LLM to answer the query in Urdu, grounded in the given context.

    The system prompt instructs the model to always reply in Urdu regardless
    of the question's language; the user message carries context + question.
    """
    system_prompt = (
        "آپ ایک مددگار اسسٹنٹ ہیں جو ہمیشہ اردو میں جواب دیتا ہے، چاہے سوال اردو یا انگریزی میں ہو۔ "
        "براہ کرم نیچے دیے گئے سیاق و سباق اور سوال کی بنیاد پر اردو میں تفصیلی جواب دیں۔"
    )
    user_prompt = f"سیاق و سباق:\n{context}\n\nسوال:\n{query}"

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content

# Streamlit UI
st.title("📚 اردو ڈاکیومنٹس کے لیے RAG ایپ")
uploaded_file = st.file_uploader("پی ڈی ایف یا ورڈ فائل اپلوڈ کریں", type=["pdf", "docx"])

if uploaded_file:
    # Persist the upload to disk: pdf2image and python-docx need a file path.
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + uploaded_file.name.split('.')[-1]) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name

    try:
        # Case-insensitive check (".PDF" previously fell through to the docx
        # branch); the uploader only admits pdf/docx, so non-PDF means docx.
        if uploaded_file.name.lower().endswith(".pdf"):
            text = extract_text_from_pdf(tmp_path)
        else:
            text = extract_text_from_docx(tmp_path)
    finally:
        # delete=False means the file survives the with-block; without this
        # cleanup every Streamlit rerun leaked a temp file.
        os.remove(tmp_path)

    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(np.array(embeddings))

    st.success("ڈاکیومنٹ پروسیس ہو گیا ہے۔ اب سوال پوچھیں۔")
    query = st.text_input("سوال درج کریں")

    if query:
        top_chunks = search_index(index, query, chunks)
        context = "\n".join(top_chunks)
        answer = query_groq(query, context)
        st.markdown("### جواب:")
        st.write(answer)