muhammadshaheryar commited on
Commit
7abcf85
·
verified ·
1 Parent(s): 816e88d

Upload rag_app.py

Browse files
Files changed (1) hide show
  1. rag_app.py +83 -0
rag_app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Install dependencies before running this app (the original file contained a
# notebook-style "!pip install ..." line, which is a SyntaxError in a plain
# .py module; run the command below in your shell instead):
#
#   pip install transformers faiss-cpu PyMuPDF streamlit
2
+
3
+ import fitz # PyMuPDF for PDF handling
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import faiss
6
+ import torch
7
+ import streamlit as st
8
+
9
# --- Embedding model -------------------------------------------------------
# Load model and tokenizer for embedding.
# NOTE: this runs at import time and downloads the model on first use —
# Streamlit re-executes the whole script on every interaction, so these
# from_pretrained() calls are repeated on each rerun (cached on disk by HF).
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Efficient model for embeddings

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# --- Vector store ----------------------------------------------------------
# Initialize FAISS index for efficient similarity search (exact L2 distance).
embedding_dim = 384  # Dimension of MiniLM embeddings (hidden size of the model above)
index = faiss.IndexFlatL2(embedding_dim)

# Shared mutable state, kept in lockstep with `index`:
# document_chunks[i] is the text whose embedding is row i of the FAISS index;
# chunk_mappings[i] is the (source file path, chunk position) for that row.
document_chunks = []
chunk_mappings = []
19
+
20
def embed_text(text):
    """Return a mean-pooled embedding for *text* as a (1, 384) NumPy array.

    Uses the module-level ``tokenizer`` and ``model``; the input is truncated
    to the model's maximum sequence length.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over the sequence dimension to get one vector per input.
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()
26
+
27
def extract_text_from_pdf(file_path):
    """Extract the plain text of every page in a PDF file.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        The concatenated text of all pages (empty string for an empty PDF).
    """
    with fitz.open(file_path) as pdf:
        # "".join is linear in total size, unlike repeated `text += ...`
        # which is quadratic on interpreters without the CPython += fast path.
        return "".join(page.get_text("text") for page in pdf)
34
+
35
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter than *chunk_size*; an empty input yields
    an empty list.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        chunks.append(text[start:start + chunk_size])
        start += chunk_size
    return chunks
38
+
39
def index_pdf(file_path):
    """Read a PDF, embed its text in chunks, and add them to the FAISS index.

    Side effects: appends rows to the module-level ``index`` and keeps
    ``document_chunks`` / ``chunk_mappings`` aligned with those rows.
    """
    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)

    for position, piece in enumerate(chunks):
        index.add(embed_text(piece))          # one (1, dim) row per chunk
        document_chunks.append(piece)
        chunk_mappings.append((file_path, position))  # remember where it came from

    print(f"Indexed {len(chunks)} chunks from {file_path}")
51
+
52
def search(query, top_k=5):
    """Return up to *top_k* indexed chunks most similar to *query*.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, optional
        Maximum number of results to return (default 5).

    Returns
    -------
    list[dict]
        Dicts with keys ``file`` (source path), ``text`` (chunk text) and
        ``distance`` (L2 distance as a plain float; lower is more similar).
        May contain fewer than *top_k* entries.
    """
    query_embedding = embed_text(query)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with id -1 when fewer than top_k vectors are
        # indexed; without this guard, chunk_mappings[-1] would silently
        # return the *last* chunk as a bogus match.
        if idx < 0:
            continue
        file_path, chunk_idx = chunk_mappings[idx]
        results.append({
            "file": file_path,
            "text": document_chunks[idx],
            # Cast numpy float32 -> float for clean display/serialization.
            "distance": float(dist),
        })

    return results
63
+
64
# Streamlit interface
# NOTE(review): Streamlit re-executes this entire script on every user
# interaction, which also re-creates the module-level FAISS index above —
# so uploaded files are (and must be) re-indexed on each rerun. Caching the
# model/index with st.cache_resource would avoid the repeated work; confirm
# before changing, since deduplicating uploads alone would leave an empty index.
st.title("RAG PDF Search System")

# Upload PDF files
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
if uploaded_files:
    for uploaded_file in uploaded_files:
        # Persist the upload to a local temp file because fitz.open()
        # takes a path. NOTE(review): these temp_* files are never deleted.
        file_path = f"temp_{uploaded_file.name}"
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        index_pdf(file_path)

# Query input
query = st.text_input("Enter your search query:")
if query:
    results = search(query)
    for result in results:
        st.write(f"**File:** {result['file']}")
        st.write(result["text"])
        # "Relevance Score" is actually an L2 distance — lower means closer.
        st.write(f"**Relevance Score:** {result['distance']}\n")