izhan001 committed on
Commit
b3bf1cf
·
verified ·
1 Parent(s): 93d82df

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ import fitz # PyMuPDF for PDF files
5
+ from docx import Document
6
+ from pptx import Presentation
7
+ import gradio as gr
8
+
9
+ # Initialize SentenceTransformer for embeddings
10
# In-memory corpus: raw document texts and their embedding vectors. Both lists
# are appended to together, so position i in one matches position i in the other.
documents = []
doc_embeddings = []

# FAISS index over the stored embeddings; stays None until a document is added.
index = None

# Sentence-embedding model used both when indexing documents and when
# embedding queries.
retrieve = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
16
+
17
+ # Function to process PDF files
18
def process_pdf(file_path):
    """Extract the plain text of every page in a PDF file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The text of all pages concatenated in page order, or a string
        starting with "Error reading PDF: " if the file cannot be opened
        or read.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Failures are reported as a string rather than raised, because the
        # caller inspects the returned text to detect errors.
        return f"Error reading PDF: {e}"
27
+
28
+ # Function to process DOCX files
29
def process_docx(file_path):
    """Return the text of a DOCX file with one paragraph per line.

    Args:
        file_path: Path to the DOCX file on disk.

    Returns:
        The paragraph texts joined by newlines, or a string starting with
        "Error reading DOCX: " if the file cannot be opened or read.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as exc:
        return f"Error reading DOCX: {exc}"
36
+
37
+ # Function to process PPTX files
38
def process_pptx(file_path):
    """Return the text of every text-bearing shape in a PPTX file.

    Args:
        file_path: Path to the PPTX file on disk.

    Returns:
        The text of each shape that exposes a ``text`` attribute, in slide
        and shape order, each followed by a newline; or a string starting
        with "Error reading PPTX: " if the file cannot be opened or read.
    """
    try:
        deck = Presentation(file_path)
        pieces = [
            shape.text + "\n"
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")  # shapes without a text attribute are skipped
        ]
        return "".join(pieces)
    except Exception as exc:
        return f"Error reading PPTX: {exc}"
49
+
50
+ # Function to add a document to the FAISS index
51
def add_to_index(text):
    """Embed a document and add it to the in-memory FAISS index.

    Blank text and text that has already been indexed are ignored, so
    submitting the same document repeatedly does not create duplicate
    search hits.

    Args:
        text: Full text of the document to index.

    Returns:
        None. Updates the module-level ``documents``, ``doc_embeddings`` and
        ``index``.
    """
    global index, doc_embeddings, documents
    if not text.strip() or text in documents:
        return

    # FAISS works on float32 vectors.
    embedding = np.asarray(retrieve.encode([text])[0], dtype=np.float32)
    doc_embeddings.append(embedding)
    documents.append(text)

    # Create the index lazily, sized to the model's embedding dimension, then
    # add only the new vector instead of rebuilding the index from scratch.
    # Vectors are added in the same order as ``documents``, so FAISS ids
    # remain valid positions in that list.
    if index is None:
        index = faiss.IndexFlatL2(embedding.shape[0])
    index.add(embedding.reshape(1, -1))
61
+
62
+ # Function to load and process a single document
63
def load_document(file_path):
    """Extract the text of a PDF, DOCX or PPTX file and index it.

    The file type is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to the document on disk.

    Returns:
        "Document loaded and indexed successfully." on success,
        "Unsupported file format" for any other extension, or the extractor's
        "Error reading ...: ..." message if the file could not be read.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        text = process_pdf(file_path)
    elif lowered.endswith('.docx'):
        text = process_docx(file_path)
    elif lowered.endswith('.pptx'):
        text = process_pptx(file_path)
    else:
        return "Unsupported file format"

    # Recognise failures by the exact prefixes the extractors emit. Searching
    # for the bare word "Error" anywhere in the text would reject every
    # legitimate document that happens to contain that word.
    error_prefixes = (
        "Error reading PDF: ",
        "Error reading DOCX: ",
        "Error reading PPTX: ",
    )
    if not isinstance(text, str) or text.startswith(error_prefixes):
        return text  # Return error message if processing fails

    add_to_index(text)
    return "Document loaded and indexed successfully."
77
+
78
+ # Retrieve documents based on the query
79
def retrieve_docs(query, k=2):
    """Return the indexed documents closest to a query.

    Args:
        query: Free-text query to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        A list of at most ``k`` document texts, nearest first. If nothing has
        been indexed yet, a single-element list holding an explanatory
        message. An empty list if ``k`` is not positive.
    """
    if index is None or index.ntotal == 0:
        return ["Index not initialized. Please upload and process a document first."]

    # Never ask for more neighbours than there are vectors: FAISS pads the
    # missing results with id -1, which as a Python list index would silently
    # return the last document as a bogus hit.
    top_k = min(k, index.ntotal)
    if top_k <= 0:
        return []

    query_embedding = np.asarray(retrieve.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # Guard against any out-of-range or padded ids.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]
86
+
87
+ # Generate a response based on retrieved documents
88
def generate_response(retrieved_docs):
    """Build a placeholder answer from the retrieved documents.

    Args:
        retrieved_docs: Sequence of document texts; may be empty or None.

    Returns:
        A fixed header followed by the first 500 characters of the documents
        joined with single spaces and a trailing "...", or a fixed
        "no relevant documents" message when nothing was retrieved.
    """
    if not retrieved_docs:
        return "No relevant documents found to generate a response."

    joined = " ".join(retrieved_docs)
    excerpt = joined[:500]
    # Placeholder response: no language model is involved, the context is
    # simply echoed back.
    return f"Generated response based on retrieved docs:\n\n{excerpt}..."
94
+
95
+ # Gradio function
96
def rag_application(query, file):
    """Gradio callback: optionally index an uploaded file, then answer a query.

    Args:
        query: The user's question.
        file: The uploaded file, or a falsy value if nothing was uploaded.
            Accepted either as an object exposing the path in ``.name`` or as
            a plain path string.

    Returns:
        A ``(retrieved_snippets, response)`` tuple of strings. If the upload
        could not be loaded, ``(message, "")`` where ``message`` explains why.
    """
    # Load and process the uploaded document if provided
    if file:
        # NOTE(review): the upload appears to arrive as a temp-file wrapper
        # or as a path string depending on the Gradio version/configuration;
        # both are handled here. Confirm against the deployed Gradio version.
        file_path = file if isinstance(file, str) else file.name
        load_result = load_document(file_path)
        # Treat anything other than the success message as a failure, so
        # "Unsupported file format" is reported instead of silently ignored.
        if load_result != "Document loaded and indexed successfully.":
            return load_result, ""

    # Retrieve relevant documents
    retrieved_docs = retrieve_docs(query)
    # Show only the first 200 characters of each hit.
    docs_output = "\n".join(f"- {doc[:200]}..." for doc in retrieved_docs)

    # Generate response
    response = generate_response(retrieved_docs)
    return docs_output, response
110
+
111
+ # Gradio interface
112
# Gradio UI around rag_application: a query box plus a single file upload in,
# two text panes (retrieved snippets, generated response) out.
iface = gr.Interface(
    title="RAG Application with Single File Upload",
    description="Upload a PDF, DOCX, or PPTX file and ask questions. The RAG application retrieves relevant documents and generates a response.",
    fn=rag_application,
    inputs=["text", "file"],
    outputs=["text", "text"],
)

# Start the Gradio app.
iface.launch()