Dushyant4342 committed on
Commit
d0364db
·
verified ·
1 Parent(s): b27e9cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -169
app.py CHANGED
@@ -1,212 +1,103 @@
1
- # app.py — RAG PDF Chat (phi-2 + LlamaIndex) in Gradio
2
- # ------------------------------------------------------------------
3
- # • LLM: microsoft/phi-2
4
- # • Embedding: BAAI/bge-small-en-v1.5
5
- # • UI: Gradio Blocks
6
- # • Retrieval: LlamaIndex VectorStoreIndex (one per PDF)
7
- # ------------------------------------------------------------------
8
-
9
  import gradio as gr
10
- import tempfile
11
- import gc
12
- from pathlib import Path
13
- from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
14
- from llama_index.core.settings import Settings
15
- from llama_index.llms.huggingface import HuggingFaceLLM
16
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17
- import torch # Explicitly import torch to check availability early
18
-
19
- print("Script starting...")
20
-
21
- # ---------------- LLM & Embeddings ----------------
22
- print("Initializing LLM and Embeddings...")
23
- try:
24
- Settings.llm = HuggingFaceLLM(
25
- model_name="microsoft/phi-2",
26
- tokenizer_name="microsoft/phi-2",
27
- device_map="auto", # Requires accelerate
28
- model_kwargs={"trust_remote_code": True}, # Often needed for Phi-2
29
- generate_kwargs={"temperature": 0.2, "max_new_tokens": 256, "repetition_penalty": 1.2},
30
- )
31
- Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
32
- print("LLM and Embeddings initialized successfully.")
33
- except Exception as e:
34
- print(f"Error initializing LLM or Embeddings: {e}")
35
- # Optionally, re-raise or handle as appropriate for your app
36
- # For now, we'll let it proceed to see if Gradio UI can at least load
37
- # to show the error, but a real app might stop here.
38
- Settings.llm = None # Ensure it's None if failed
39
- Settings.embed_model = None
40
-
41
- # ---------------- Helpers ----------------
42
- def build_index(path: str) -> VectorStoreIndex:
43
- """Create a VectorStoreIndex from the PDF at path."""
44
- print(f"Building index for: {path}")
45
- # Ensure SimpleDirectoryReader is robust
46
- try:
47
- docs = SimpleDirectoryReader(input_files=[path]).load_data()
48
- if not docs:
49
- print(f"No documents loaded from {path}. Check PDF content and reader.")
50
- # Handle empty or unreadable PDF gracefully
51
- return VectorStoreIndex.from_documents([Document(text="Error: Could not read PDF or PDF is empty.")])
52
- index = VectorStoreIndex.from_documents(docs)
53
- print(f"Index built successfully for: {path}")
54
- return index
55
- except Exception as e:
56
- print(f"Error building index for {path}: {e}")
57
- # Return a dummy index or raise an error that can be caught by the UI
58
- return VectorStoreIndex.from_documents([Document(text=f"Error processing PDF: {e}")])
59
-
60
-
61
- # ---------------- Gradio logic ----------------
62
- def add_pdfs(files, current_state):
63
- """Handle file upload, build indexes, return updated dropdown choices."""
64
- print("Adding PDFs...")
65
- indexes, chat_hist = current_state if current_state else ({}, [])
66
-
67
- if files is None:
68
- print("No files uploaded.")
69
- choices = list(indexes.keys())
70
- return gr.Dropdown.update(choices=choices, value=choices[0] if choices else None), (indexes, chat_hist)
71
-
72
- for f_obj in files: # Gradio File component gives a list of tempfile._TemporaryFileWrapper
73
- original_filename = f_obj.name # This is the path to the temporary file
74
- # Use a more descriptive name if possible, or stick to the temp name if original not easily available
75
- # For this example, we'll use the temp file's name as key, but ideally, you'd want the original upload name.
76
- # Gradio's File component might not directly give original filename easily without custom JS.
77
- # Let's assume f_obj.name is unique enough for this context or use a counter.
78
- # For simplicity, we'll use the temp file path as the key, but this is not ideal for display.
79
- # A better approach would be to get the original filename if the Gradio version supports it easily,
80
- # or manage it via the UI.
81
-
82
- # Let's use Path(original_filename).name to get just the filename part of the temp path
83
- display_name = Path(original_filename).name
84
-
85
- if display_name in indexes:
86
- print(f"Index for {display_name} already exists. Skipping.")
87
- continue
88
-
89
- # The file `f_obj` is already a file-like object pointing to the uploaded content.
90
- # We need its path. `f_obj.name` gives the path to the temporary file Gradio creates.
91
- try:
92
- print(f"Processing file: {display_name} from path: {original_filename}")
93
- # No need to write to another tempfile, Gradio already provides one.
94
- idx = build_index(original_filename)
95
- indexes[display_name] = idx # Use display_name as key
96
- print(f"Index for {display_name} added.")
97
- except Exception as e:
98
- print(f"Failed to process file {display_name}: {e}")
99
- # Optionally, inform the user via the UI
100
- # For now, just log and skip.
101
-
102
- gc.collect() # Clean up memory
103
- choices = list(indexes.keys())
104
- updated_value = choices[0] if choices else None
105
- print(f"PDFs processed. Choices: {choices}, Selected: {updated_value}")
106
- return gr.Dropdown.update(choices=choices, value=updated_value), (indexes, chat_hist)
107
-
108
- def chat(query, pdf_choice, current_state):
109
- """Handle chat query with the selected PDF."""
110
- print(f"Chat query: '{query}' for PDF: '{pdf_choice}'")
111
- indexes, chat_hist = current_state
112
-
113
- if not Settings.llm or not Settings.embed_model:
114
- answer = "⚠️ LLM or Embedding model not initialized. Please check server logs."
115
- chat_hist = chat_hist + [[query, answer]]
116
- return chat_hist, (indexes, chat_hist)
117
-
118
- if not pdf_choice or pdf_choice not in indexes:
119
- answer = "⚠️ Please select a PDF to chat with, or the selected PDF index is not available."
120
- if not pdf_choice:
121
- print("No PDF selected for chat.")
122
- else:
123
- print(f"PDF choice '{pdf_choice}' not found in indexes: {list(indexes.keys())}")
124
- chat_hist = chat_hist + [[query, answer]]
125
- return chat_hist, (indexes, chat_hist)
126
-
127
- query_engine = indexes[pdf_choice].as_query_engine(similarity_top_k=4)
128
- try:
129
- print(f"Querying engine for PDF: {pdf_choice}...")
130
- response = query_engine.query(query)
131
- answer = response.response
132
- print("Query successful.")
133
- except Exception as e:
134
- answer = f"⚠️ Error during query: {e}"
135
- print(f"Exception during query: {e}")
136
 
137
- chat_hist = chat_hist + [[query, answer]]
138
- return chat_hist, (indexes, chat_hist)
 
 
139
 
140
  def clear_chat_and_query(current_state):
141
- """Clears the chatbot and the query box."""
142
- indexes, _ = current_state # Keep indexes
143
- return [], (indexes, []), "" # Clear chatbot, new empty chat_hist, clear query_box
 
144
 
145
- print("Building Gradio interface...")
146
  with gr.Blocks(theme=gr.themes.Soft(), css="footer {display:none}") as demo:
147
- gr.Markdown("## 📄 Chat with any PDF   |   **microsoft/phi-2 + LlamaIndex**")
 
148
 
149
- # (indexes dict: {filename: VectorStoreIndex}, chat_history list: [[user_msg, bot_msg], ...])
150
- # Initialize with empty dict for indexes and empty list for chat_hist
151
- app_state = gr.State(({}, []))
152
 
153
  with gr.Row():
154
  with gr.Column(scale=1, min_width=300):
155
  file_box = gr.File(
156
- label="Upload PDF(s)",
157
  file_types=[".pdf"],
158
  file_count="multiple"
159
  )
160
  pdf_select = gr.Dropdown(
161
- label="Choose a PDF to chat with",
162
  interactive=True
163
  )
164
  with gr.Column(scale=3, min_width=500):
165
  chatbot = gr.Chatbot(
166
- label="Conversation",
167
  bubble_full_width=False,
168
  height=500
169
  )
170
  query_box = gr.Textbox(
171
- label="Ask a question…",
172
  placeholder="Type your question here and press Enter.",
173
- scale=4
174
  )
175
  clear_button = gr.Button("Clear Chat")
176
 
177
-
178
- # Event handlers
179
  file_box.upload(
180
- fn=add_pdfs,
181
  inputs=[file_box, app_state],
182
  outputs=[pdf_select, app_state]
183
  )
184
-
185
- # When a PDF is selected from dropdown, or when files are uploaded and dropdown is updated
186
- # you might want to clear the chat history for the new PDF.
187
- # This can be chained or handled in add_pdfs if desired.
188
- # For now, chat is persistent until "Clear Chat" is pressed.
189
-
190
  query_box.submit(
191
- fn=chat,
192
  inputs=[query_box, pdf_select, app_state],
193
  outputs=[chatbot, app_state]
194
  )
195
-
196
- # Clear button functionality
197
  clear_button.click(
198
  fn=clear_chat_and_query,
199
  inputs=[app_state],
200
- outputs=[chatbot, app_state, query_box] # chatbot, app_state (to reset chat_hist), query_box
201
  )
202
 
203
- print("Gradio Blocks defined.")
204
 
205
  if __name__ == "__main__":
206
- print("Launching Gradio app...")
207
- # For Hugging Face Spaces, demo.launch() is usually sufficient.
208
- # queue() is good for handling multiple users.
209
- # Ensure share=False (default) or not set, as Spaces handles public access.
210
- demo.queue().launch()
211
- print("Gradio app launched.")
 
 
 
 
 
212
 
 
 
1
+ # app.py — MINIMAL TEST VERSION
 
 
 
 
 
 
 
2
  import gradio as gr
3
+ import time # For a small delay to help logs catch up if needed
4
+
5
+ print(f"[{time.time()}] SCRIPT START: Minimal test app.py is running.")
6
+
7
+ # ---------------- Gradio logic (Simplified) ----------------
8
def dummy_add_pdfs(files, current_state):
    """Mock upload handler: ignore file contents and populate the dropdown
    with placeholder choices so the UI wiring can be tested without models.

    Args:
        files: list of uploaded file objects from the gr.File component
            (None when nothing was uploaded).
        current_state: app state tuple (indexes_placeholder, chat_history).

    Returns:
        (dropdown update, state tuple) — matching the [pdf_select, app_state]
        outputs wired to file_box.upload in the Blocks definition.
    """
    print(f"[{time.time()}] dummy_add_pdfs called.")
    # Simulate some processing
    time.sleep(0.1)
    # Don't actually process files, just update UI
    choices = ["PDF A (mock)", "PDF B (mock)"] if files else []
    selected_choice = choices[0] if choices else None
    print(f"[{time.time()}] dummy_add_pdfs: Choices: {choices}, Selected: {selected_choice}")
    # Ensure state is a tuple
    state = current_state if isinstance(current_state, tuple) else (None, [])
    # Use gr.update(): it exists in both Gradio 3.x and 4.x, whereas the
    # class-level gr.Dropdown.update() was removed in Gradio 4.0 and raises
    # AttributeError there.
    return gr.update(choices=choices, value=selected_choice), state
19
+
20
def dummy_chat(query, pdf_choice, current_state):
    """Mock chat handler: return a canned markdown reply instead of running a
    model, so the Chatbot wiring can be exercised in isolation.

    Args:
        query: the user's question from the textbox.
        pdf_choice: currently selected dropdown value.
        current_state: app state tuple (indexes_placeholder, chat_history).

    Returns:
        (updated history, new state tuple) — matching the [chatbot, app_state]
        outputs wired to query_box.submit.
    """
    print(f"[{time.time()}] dummy_chat called with query: '{query}', PDF: '{pdf_choice}'")
    # Simulate some processing
    time.sleep(0.1)

    # Recover prior history only when the state has the expected
    # (indexes, history) shape; otherwise start a fresh conversation.
    if (isinstance(current_state, tuple)
            and len(current_state) > 1
            and isinstance(current_state[1], list)):
        indexes, history = current_state[0], current_state[1]
    else:
        history = []
        indexes = current_state[0] if isinstance(current_state, tuple) else None

    answer = f"This is a **mock** response to '{query}' for '{pdf_choice}'. Model loading is disabled for this test."
    # Build a new list rather than mutating the one stored in state.
    history = history + [[query, answer]]

    new_state = (indexes, history)
    print(f"[{time.time()}] dummy_chat: History updated. Returning new state.")
    return history, new_state
34
 
35
def clear_chat_and_query(current_state):
    """Reset the conversation: empty chatbot, empty history, cleared textbox.

    The indexes placeholder (first element of the state tuple) is carried
    over untouched so uploaded-PDF bookkeeping survives a chat reset.
    """
    print(f"[{time.time()}] clear_chat_and_query called.")
    # Ensure state is correctly structured
    if isinstance(current_state, tuple):
        indexes = current_state[0]
    else:
        indexes = None
    return [], (indexes, []), ""
40
 
41
# ---------------- UI definition + launch (minimal test version) ----------------
print(f"[{time.time()}] Building Gradio interface (minimal)...")

with gr.Blocks(theme=gr.themes.Soft(), css="footer {display:none}") as demo:
    gr.Markdown("## 📄 Minimal Test: PDF Chat App (Models Disabled)")
    gr.Markdown("### If you see this, Gradio started. Model loading is bypassed.")

    # App state is a tuple: (indexes_placeholder, chat_history_list).
    app_state = gr.State((None, []))

    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            file_box = gr.File(
                label="Upload PDF(s) (Mock)",
                file_types=[".pdf"],
                file_count="multiple",
            )
            pdf_select = gr.Dropdown(label="Choose a PDF (Mock)", interactive=True)
        with gr.Column(scale=3, min_width=500):
            # NOTE(review): bubble_full_width is deprecated in newer Gradio
            # releases — confirm against the targeted Gradio version.
            chatbot = gr.Chatbot(
                label="Conversation (Mock)",
                bubble_full_width=False,
                height=500,
            )
            query_box = gr.Textbox(
                label="Ask a question (Mock)…",
                placeholder="Type your question here and press Enter.",
            )
            clear_button = gr.Button("Clear Chat")

    # Wire events to the mock handlers (no model work happens).
    file_box.upload(
        fn=dummy_add_pdfs,
        inputs=[file_box, app_state],
        outputs=[pdf_select, app_state],
    )
    query_box.submit(
        fn=dummy_chat,
        inputs=[query_box, pdf_select, app_state],
        outputs=[chatbot, app_state],
    )
    clear_button.click(
        fn=clear_chat_and_query,
        inputs=[app_state],
        outputs=[chatbot, app_state, query_box],
    )

print(f"[{time.time()}] Gradio Blocks defined (minimal).")

if __name__ == "__main__":
    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app (minimal)...")
    try:
        # Adding a small delay before launch, sometimes helps with log flushing in constrained envs
        # time.sleep(2)
        demo.queue().launch(debug=True)  # Keep debug=True for Gradio logs
        print(f"[{time.time()}] MAIN: Gradio app demo.launch() called (minimal). Monitor for 'Application startup complete'.")
    except Exception as e:
        print(f"[{time.time()}] FATAL ERROR during demo.launch() (minimal): {e}")
        # Write error to a file as a last resort if logs aren't showing
        with open("launch_error.txt", "w") as f_err:
            f_err.write(str(e))

print(f"[{time.time()}] SCRIPT END: Minimal test app.py has finished executing initial setup code.")