import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Initialize model and tokenizer
MODEL_PATH = "gokul-pv/Llama-3.2-1B-Instruct-16bit-CodeArchitect"

# Cache the model/tokenizer so they are loaded once, not on every request
_model_cache = {}


def load_model():
    if "model" not in _model_cache:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float32,  # Use float32 for CPU
            device_map="cpu"            # Ensure model runs on CPU
        )
        _model_cache["model"] = model
        _model_cache["tokenizer"] = tokenizer
    return _model_cache["model"], _model_cache["tokenizer"]


class CustomTextStreamer:
    """Custom streamer that captures only the model's response."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.generated_text = []
        self.next_tokens_are_prompt = True

    def put(self, value):
        # generate() passes token-id tensors; the first call carries the prompt
        if isinstance(value, torch.Tensor):
            if len(value.shape) > 1:
                value = value[0]
            decoded_text = self.tokenizer.decode(value.tolist(), skip_special_tokens=True)
        else:
            decoded_text = value

        if self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False  # Skip prompt tokens
        else:
            self.generated_text.append(decoded_text)
            print(decoded_text, end="", flush=True)

    def end(self):
        self.next_tokens_are_prompt = True
        print("")

    def get_generated_text(self):
        return "".join(self.generated_text)


def analyze_architecture(code_input, temperature=1.5, max_tokens=512):
    """Analyze architecture code using the loaded model."""
    model, tokenizer = load_model()

    messages = [
        {
            "role": "system",
            "content": "You are an expert in analyzing system architecture written using code. "
                       "You check the architecture and provide clear and detailed explanations "
                       "regarding how the architecture can be improved for better performance, "
                       "scalability, maintainability, and cost-effectiveness. You also check "
                       "for possible cybersecurity issues and whether the components can be "
                       "replaced with newer and better components."
        },
        {
            "role": "user",
            "content": code_input
        }
    ]

    # Tokenize input
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cpu")  # Ensure tensors are on CPU

    # Initialize text streamer
    text_streamer = CustomTextStreamer(tokenizer)

    # Generate response
    with torch.inference_mode():
        model.generate(
            input_ids=inputs,
            streamer=text_streamer,
            max_new_tokens=max_tokens,
            use_cache=True,
            do_sample=True,  # Required for temperature/min_p to take effect
            temperature=temperature,
            min_p=0.1
        )

    return text_streamer.get_generated_text()


# Create Gradio interface
def create_gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Code Architect")
        with gr.Row():
            with gr.Column():
                code_input = gr.Code(
                    label="Input Architecture Code",
                    language="python",
                    lines=10
                )
                with gr.Row():
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=1.5,
                        label="Temperature"
                    )
                    max_tokens = gr.Slider(
                        minimum=64,
                        maximum=2048,
                        value=512,
                        step=64,
                        label="Max Tokens"
                    )
                submit_btn = gr.Button("Analyze Architecture")
            with gr.Column():
                output = gr.Markdown(label="Analysis Results")

        submit_btn.click(
            fn=analyze_architecture,
            inputs=[code_input, temperature, max_tokens],
            outputs=output
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(
        share=True,             # Enable sharing
        server_name="0.0.0.0",  # Listen on all network interfaces
        server_port=7860        # Default Gradio port
    )
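
# --- Optional: programmatic usage sketch ---
# A minimal, illustrative example of calling analyze_architecture() directly,
# bypassing the Gradio UI. The sample_architecture string below is a made-up
# input, not part of the original app; adjust temperature/max_tokens to taste.
#
# sample_architecture = """
# services:
#     web:
#         image: nginx:1.19
#         ports: ["80:80"]
#     db:
#         image: mysql:5.7
# """
# analysis = analyze_architecture(sample_architecture, temperature=0.7, max_tokens=256)
# print(analysis)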