thesnak committed on
Commit 23325b9 · verified · 1 Parent(s): 908adcc

Create app.py

Files changed (1): app.py (+102, -0)
app.py ADDED
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load models
embedding_model = SentenceTransformer('intfloat/multilingual-e5-base')
model_name = "silma-ai/SILMA-Kashif-2B-Instruct-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# SILMA Kashif is a decoder-only instruction model, so load it as a causal LM
model = AutoModelForCausalLM.from_pretrained(model_name)

# Global variables
documents = []
index = None

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() may return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Function to preprocess document into chunks
def preprocess_document(text, chunk_size=200):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to generate embeddings
def generate_embeddings(chunks):
    embeddings = embedding_model.encode(chunks)
    return embeddings

# Function to update FAISS index
def update_vector_database(chunks, embeddings):
    global index, documents
    documents.extend(chunks)

    embeddings = np.array(embeddings, dtype="float32")  # FAISS expects float32 vectors
    if index is None:
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity

    index.add(embeddings)

# Function to retrieve relevant documents
def retrieve_documents(query, top_k=3):
    query_embedding = np.array(embedding_model.encode([query]), dtype="float32")
    distances, indices = index.search(query_embedding, top_k)
    retrieved_docs = [documents[idx] for idx in indices[0]]
    return retrieved_docs

# Function to generate answers
def generate_answer(context, question):
    input_text = f"context: {context} question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Decode only the newly generated tokens, not the prompt
    answer = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return answer

# Function for the full RAG pipeline
def rag_pipeline(question):
    retrieved_docs = retrieve_documents(question, top_k=3)
    context = " ".join(retrieved_docs)
    answer = generate_answer(context, question)
    return answer

# Streamlit app
st.title("Bilingual RAG Application (Arabic & English)")

# Upload PDF section
st.header("Upload a PDF Document")
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file:
    with st.spinner("Processing PDF..."):
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # Preprocess text into chunks
        chunks = preprocess_document(text)

        # Generate embeddings and update FAISS index
        embeddings = generate_embeddings(chunks)
        update_vector_database(chunks, embeddings)

    st.success("PDF processed successfully!")

# Query section
st.header("Ask a Question")
question = st.text_input("Enter your question here (in Arabic or English):")

if question:
    if not documents:
        st.error("Please upload a PDF document first.")
    else:
        with st.spinner("Generating answer..."):
            answer = rag_pipeline(question)
            st.write(f"**Answer:** {answer}")
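
For quick local testing outside the Streamlit UI, the same functions can be driven directly. A minimal sketch, assuming the file above is importable as a module named app and that a local PDF exists at the placeholder path sample.pdf (both are assumptions, not part of the commit):

# Minimal sketch: exercise the RAG pipeline without the Streamlit UI.
# "sample.pdf" is a placeholder path; importing app will also run its
# module-level Streamlit calls, which is harmless for a quick check.
from app import (
    extract_text_from_pdf,
    preprocess_document,
    generate_embeddings,
    update_vector_database,
    rag_pipeline,
)

text = extract_text_from_pdf("sample.pdf")
chunks = preprocess_document(text)
update_vector_database(chunks, generate_embeddings(chunks))
print(rag_pipeline("What is the main topic of this document?"))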