Dushyant4342 committed
Commit 497021f · verified · 1 Parent(s): 09507d6

Update app.py

Files changed (1):
  1. app.py +145 -30
app.py CHANGED
@@ -1,40 +1,155 @@
 import gradio as gr
-import time
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import time
 
-print(f"[{time.time()}] SCRIPT START: Echo Bot Test. PID: {os.getpid()}")
+print(f"[{time.time()}] SCRIPT START: Small Local LLM Chat. PID: {os.getpid()}")
 
-def echo_chat(message, history):
-    # message: The user's input string
-    # history: A list of previous interactions [[user_msg_1, bot_msg_1], [user_msg_2, bot_msg_2], ...]
-    print(f"[{time.time()}] echo_chat called. Message: '{message}'")
-
-    # Simulate a little bit of work
-    time.sleep(0.2)
-
-    # The function for gr.ChatInterface should return the bot's response as a string
-    return f"Echo: {message}"
+# --- Configuration ---
+MODEL_NAME = "distilgpt2"  # A small and efficient model
+# For something slightly larger, try "gpt2" (the smallest version of GPT-2)
+# MODEL_NAME = "gpt2"
+
+# Determine device: use CUDA if available, otherwise CPU.
+# For small models on typical HF Spaces, CPU is often the only option, or the more stable one.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[{time.time()}] Using device: {DEVICE}")
+
+# --- Load Model and Tokenizer ---
+# This section can take some time and memory, especially on first run (downloading the model)
+model = None
+tokenizer = None
+model_load_error = None
+
+try:
+    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
+
+    # Add a padding token if it doesn't exist (common for GPT-2 models)
+    if tokenizer and tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")
+
+    print(f"[{time.time()}] Loading model {MODEL_NAME} to {DEVICE}...")
+    # For CPU, ensure the model is explicitly moved; device_map='auto' might try the GPU.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
+    model.eval()  # Set model to evaluation mode
+    print(f"[{time.time()}] Model {MODEL_NAME} loaded successfully on {DEVICE}.")
+
+except Exception as e:
+    model_load_error = str(e)
+    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
+    # Fallback / error display is handled in the Gradio UI
+
+# --- Chat Function ---
+def generate_chat_response(message, history):
+    """
+    Generates a response from the local LLM.
+    'message' is the user's new input.
+    'history' is a list of previous [user, bot] pairs.
+    """
+    print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
+
+    if model_load_error or not model or not tokenizer:
+        error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
+        print(f"[{time.time()}] {error_msg}")
+        return error_msg
+
+    # Basic conversation history formatting (can be improved):
+    # prepend the history to the current message to give some context.
+    # Keep history short to avoid exceeding the max input length of small models.
+    prompt = ""
+    # Limit history to the last 2 turns to keep the prompt short
+    for user_msg, bot_msg in history[-2:]:
+        prompt += f"User: {user_msg}\nBot: {bot_msg}\n"
+    prompt += f"User: {message}\nBot:"
+
+    try:
+        print(f"[{time.time()}] Encoding prompt for model...")
+        # Ensure padding_side is set correctly if using padding during generation (not typical for single-prompt generation):
+        # tokenizer.padding_side = "left"  # Important for decoder-only models if batching
+
+        inputs = tokenizer.encode_plus(
+            prompt,
+            return_tensors="pt",
+            padding=True,    # Pad to max length of batch (or model if single)
+            truncation=True,
+            max_length=512   # Max input length (distilgpt2 allows 1024, but keep it reasonable)
+        ).to(DEVICE)
+
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+
+        print(f"[{time.time()}] Generating response... Input ID length: {input_ids.shape[1]}")
+
+        # Generate response.
+        # `max_length` here would be the total length of input + output;
+        # `max_new_tokens` is usually preferred for controlling output length specifically.
+        with torch.no_grad():  # Disable gradient calculations for inference
+            output_sequences = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=60,  # Max number of new tokens to generate
+                num_return_sequences=1,
+                pad_token_id=tokenizer.pad_token_id,  # Use the pad token ID from the tokenizer
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=True,  # Enable sampling for more diverse outputs
+                top_k=50,        # Consider top_k tokens for sampling
+                top_p=0.95,      # Use nucleus sampling
+                temperature=0.8  # Controls randomness
+            )
+
+        # Decode only the newly generated part of the sequence
+        response_text = tokenizer.decode(output_sequences[0][input_ids.shape[-1]:], skip_special_tokens=True)
+
+        # Basic post-processing: remove potential artifacts or incomplete sentences if needed
+        response_text = response_text.strip()
+
+        print(f"[{time.time()}] Raw generated text: '{response_text}'")
+        if not response_text:
+            response_text = "I'm not sure how to respond to that right now."
+
+        return response_text
+
+    except Exception as e:
+        print(f"[{time.time()}] Error during text generation: {e}")
+        return f"Error generating response: {e}"
 
+# --- Gradio Interface ---
 if __name__ == "__main__":
-    print(f"[{time.time()}] MAIN: Building Gradio interface (Echo Bot)...")
+    print(f"[{time.time()}] MAIN: Building Gradio interface (Small Local LLM Chat)...")
 
-    # Using gr.ChatInterface for a very standard and robust chat UI
-    iface = gr.ChatInterface(
-        fn=echo_chat,
-        title="Echo Bot Test",
-        description="Type a message and it will be echoed back. This tests basic Gradio functionality.",
-        examples=["Hello Gradio!", "Is this working?"],
-        cache_examples=False  # Disable caching for this simple test
+    interface_title = f"Chat with Small Local LLM ({MODEL_NAME})"
+    interface_description = f"""
+    This app runs a small language model ({MODEL_NAME}) directly in this Space.
+    Responses might be slow and simple due to the model's size and CPU processing.
+    """
+    if model_load_error:
+        interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
+    elif not model or not tokenizer:
+        interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
+
+    chat_interface = gr.ChatInterface(
+        fn=generate_chat_response,
+        title=interface_title,
+        description=interface_description,
+        examples=[["Hello, who are you?"], ["What is 1+1?"]],
+        cache_examples=False,  # Disable caching for dynamic model responses
+        retry_btn="Retry",
+        undo_btn="Delete last",
+        clear_btn="Clear chat",
     )
-
-    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app (Echo Bot)...")
+
+    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
     try:
-        iface.launch(debug=True)  # debug=True gives more verbose Gradio logs
-        print(f"[{time.time()}] MAIN: Gradio app launch() called (Echo Bot). Monitor logs for 'Application startup complete'.")
+        chat_interface.queue().launch(debug=True)  # queue() for better request handling, debug=True for verbose logs
+        print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
     except Exception as e:
-        print(f"[{time.time()}] FATAL ERROR during launch (Echo Bot): {e}")
-        # As a last resort, try to write the error to a file if logs are inaccessible
-        with open("launch_error.txt", "w") as f_err:
-            f_err.write(f"Error during Echo Bot launch: {str(e)}\n")
-
-print(f"[{time.time()}] SCRIPT END: Echo Bot test app.py has finished executing initial setup code.")
+        print(f"[{time.time()}] FATAL ERROR during launch: {e}")
+        with open("launch_error.txt", "w") as f_err:  # Fallback error logging
+            f_err.write(f"Error during Small LLM Chat launch: {str(e)}\n")
+
+print(f"[{time.time()}] SCRIPT END: Small Local LLM Chat app.py has finished initial setup.")