Update app.py
app.py CHANGED
@@ -1,71 +1,54 @@
  import streamlit as st
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
- from typing import List, Dict
- import time

  class LlamaDemo:
      def __init__(self):
-
-         self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
          # Initialize in lazy loading fashion
-         self._model = None
-         self._tokenizer = None

      @property
-     def model(self):
-         if self._model is None:
-             self._model = AutoModelForCausalLM.from_pretrained(
-                 self.model_name,
                  torch_dtype=torch.float16,
                  device_map="auto",
                  trust_remote_code=True
              )
-         return self._model
-
-     @property
-     def tokenizer(self):
-         if self._tokenizer is None:
-             self._tokenizer = AutoTokenizer.from_pretrained(
-                 self.model_name,
-                 trust_remote_code=True
-             )
-         return self._tokenizer

      def generate_response(self, prompt: str, max_length: int = 512) -> str:
-         # Format
-

-

-         #
-
-         outputs = self.model.generate(
-             **inputs,
-             max_new_tokens=max_length,
-             num_return_sequences=1,
-             temperature=0.7,
-             do_sample=True,
-             pad_token_id=self.tokenizer.eos_token_id
-         )
-
-         response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-         # Remove the prompt from the response
-         response = response.split("<|assistant|>")[-1].strip()
-         return response

  def main():
      st.set_page_config(
-         page_title="
          page_icon="🦙",
          layout="wide"
      )

-     st.title("🦙

-     # Initialize
      if 'llama' not in st.session_state:
-         with st.spinner("Loading
              st.session_state.llama = LlamaDemo()

      if 'chat_history' not in st.session_state:
@@ -73,17 +56,11 @@ def main():

      # Chat interface
      with st.container():
-         # Display chat history
          for message in st.session_state.chat_history:
-
-
-
-             with st.chat_message(role):
-                 st.write(content)

-         # Input for new message
          if prompt := st.chat_input("What would you like to discuss?"):
-             # Add user message to chat history
              st.session_state.chat_history.append({
                  "role": "user",
                  "content": prompt
@@ -92,32 +69,24 @@
              with st.chat_message("user"):
                  st.write(prompt)

-             # Show assistant response
              with st.chat_message("assistant"):
-                 message_placeholder = st.empty()
-
                  with st.spinner("Thinking..."):
-
-
-
-
-
-
-
-

-     # Sidebar with settings and info
      with st.sidebar:
-         st.header("Settings")
-         max_length = st.slider("Maximum response length", 64, 1024, 512)
-
-         st.markdown("---")
          st.markdown("""
          ### About
-         This demo uses
-         still capable. It's perfect for demonstrations and testing.

-         The model
          """)

      if st.button("Clear Chat History"):
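The removed version loaded TinyLlama with AutoModelForCausalLM and AutoTokenizer and generated text by calling model.generate directly; several of those removed lines are truncated above. A minimal self-contained sketch of that pattern follows, with the chat template and variable names as assumptions rather than the original code:

    # Hypothetical reconstruction of the removed tokenize-and-generate path (not the original lines)
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )

    prompt = "What is lazy loading?"
    # Assumed chat markers; the removed code split on "<|assistant|>", which suggests a template like this
    formatted = f"<|user|>\n{prompt}</s>\n<|assistant|>\n"
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(text.split("<|assistant|>")[-1].strip())

The new version below replaces this hand-rolled tokenize/generate/decode path with a single transformers pipeline.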
  import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  import torch

  class LlamaDemo:
      def __init__(self):
+         self.model_name = "meta-llama/Llama-2-70b-chat-hf"
          # Initialize in lazy loading fashion
+         self._pipe = None

      @property
+     def pipe(self):
+         if self._pipe is None:
+             self._pipe = pipeline(
+                 "text-generation",
+                 model=self.model_name,
                  torch_dtype=torch.float16,
                  device_map="auto",
                  trust_remote_code=True
              )
+         return self._pipe

      def generate_response(self, prompt: str, max_length: int = 512) -> str:
+         # Format prompt for Llama 2 chat
+         formatted_prompt = f"[INST] {prompt} [/INST]"

+         # Generate response using pipeline
+         response = self.pipe(
+             formatted_prompt,
+             max_new_tokens=max_length,
+             num_return_sequences=1,
+             temperature=0.7,
+             do_sample=True,
+             top_p=0.9
+         )[0]['generated_text']

+         # Extract response after the instruction tag
+         return response.split("[/INST]")[-1].strip()

  def main():
      st.set_page_config(
+         page_title="Llama 2 Chat Demo",
          page_icon="🦙",
          layout="wide"
      )

+     st.title("🦙 Llama 2 Chat Demo")

+     # Initialize model
      if 'llama' not in st.session_state:
+         with st.spinner("Loading Llama 2... This might take a few minutes..."):
              st.session_state.llama = LlamaDemo()

      if 'chat_history' not in st.session_state:

      # Chat interface
      with st.container():
          for message in st.session_state.chat_history:
+             with st.chat_message(message["role"]):
+                 st.write(message["content"])

          if prompt := st.chat_input("What would you like to discuss?"):
              st.session_state.chat_history.append({
                  "role": "user",
                  "content": prompt

              with st.chat_message("user"):
                  st.write(prompt)

              with st.chat_message("assistant"):
                  with st.spinner("Thinking..."):
+                     try:
+                         response = st.session_state.llama.generate_response(prompt)
+                         st.write(response)
+                         st.session_state.chat_history.append({
+                             "role": "assistant",
+                             "content": response
+                         })
+                     except Exception as e:
+                         st.error(f"Error: {str(e)}")

      with st.sidebar:
          st.markdown("""
          ### About
+         This demo uses Llama-2-70B-chat, a large language model from Meta.

+         The model runs with automatic device mapping and mixed precision for optimal performance.
          """)

      if st.button("Clear Chat History"):
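Outside Streamlit, the updated pipeline-based flow can be smoke-tested in a few lines. This is a sketch under stated assumptions: meta-llama/Llama-2-70b-chat-hf is a gated checkpoint that needs roughly 140 GB of GPU memory in float16, so a smaller chat model may have to be substituted for local testing.

    # Sketch of the pipeline call used by the updated app, and of why the "[/INST]" split works
    import torch
    from transformers import pipeline

    pipe = pipeline(
        "text-generation",
        model="meta-llama/Llama-2-70b-chat-hf",  # assumes access to the gated weights
        torch_dtype=torch.float16,
        device_map="auto",
    )

    out = pipe(
        "[INST] Explain lazy loading in one sentence. [/INST]",
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # The pipeline returns a list of dicts whose "generated_text" still contains the prompt,
    # so the assistant's reply is everything after the closing instruction tag.
    print(out[0]["generated_text"].split("[/INST]")[-1].strip())

Lazy-loading the pipeline behind a @property keeps the Space responsive at startup: the model download and device placement only happen when the first chat message arrives.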