thesnak committed on
Commit 23325b9 · verified · 1 Parent(s): 908adcc

Create app.py

Files changed (1): app.py (+102, -0)
app.py ADDED
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load models
embedding_model = SentenceTransformer('intfloat/multilingual-e5-base')
model_name = "silma-ai/SILMA-Kashif-2B-Instruct-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# SILMA Kashif is a decoder-only instruction model, so load it as a causal LM
model = AutoModelForCausalLM.from_pretrained(model_name)

# Global variables
documents = []
index = None

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() may return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Function to preprocess document into chunks
def preprocess_document(text, chunk_size=200):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# Function to generate embeddings
def generate_embeddings(chunks):
    embeddings = embedding_model.encode(chunks)
    return embeddings

# Function to update FAISS index
def update_vector_database(chunks, embeddings):
    global index, documents
    documents.extend(chunks)

    embeddings = np.array(embeddings, dtype="float32")  # FAISS expects float32 vectors
    if index is None:
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity

    index.add(embeddings)

# Function to retrieve relevant documents
def retrieve_documents(query, top_k=3):
    query_embedding = np.array(embedding_model.encode([query]), dtype="float32")
    distances, indices = index.search(query_embedding, top_k)
    retrieved_docs = [documents[idx] for idx in indices[0]]
    return retrieved_docs

# Function to generate answers
def generate_answer(context, question):
    input_text = f"context: {context} question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Decode only the newly generated tokens, not the prompt
    answer = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return answer

# Function for the full RAG pipeline
def rag_pipeline(question):
    retrieved_docs = retrieve_documents(question, top_k=3)
    context = " ".join(retrieved_docs)
    answer = generate_answer(context, question)
    return answer

# Streamlit app
st.title("Bilingual RAG Application (Arabic & English)")

# Upload PDF section
st.header("Upload a PDF Document")
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file:
    with st.spinner("Processing PDF..."):
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # Preprocess text into chunks
        chunks = preprocess_document(text)

        # Generate embeddings and update FAISS index
        embeddings = generate_embeddings(chunks)
        update_vector_database(chunks, embeddings)

    st.success("PDF processed successfully!")

# Query section
st.header("Ask a Question")
question = st.text_input("Enter your question here (in Arabic or English):")

if question:
    if not documents:
        st.error("Please upload a PDF document first.")
    else:
        with st.spinner("Generating answer..."):
            answer = rag_pipeline(question)
            st.write(f"**Answer:** {answer}")
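
For quick local testing outside the Streamlit UI, the same functions can be driven directly. A minimal sketch, assuming the file above is importable as a module named app and that a local PDF exists at the placeholder path sample.pdf (both are assumptions, not part of the commit):

# Minimal sketch: exercise the RAG pipeline without the Streamlit UI.
# "sample.pdf" is a placeholder path; importing app will also run its
# module-level Streamlit calls, which is harmless for a quick check.
from app import (
    extract_text_from_pdf,
    preprocess_document,
    generate_embeddings,
    update_vector_database,
    rag_pipeline,
)

text = extract_text_from_pdf("sample.pdf")
chunks = preprocess_document(text)
update_vector_database(chunks, generate_embeddings(chunks))
print(rag_pipeline("What is the main topic of this document?"))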