Update app.py
app.py CHANGED
@@ -1,51 +1,92 @@
 import streamlit as st
-from transformers import …
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+from huggingface_hub import login
+import os
+
+def init_huggingface():
+    """Initialize Hugging Face authentication either from secrets or user input"""
+    if 'HUGGING_FACE_TOKEN' not in st.session_state:
+        # First try to get from environment variable
+        token = os.getenv('HUGGINGFACE_TOKEN')
+
+        # If not in environment, check streamlit secrets
+        if not token and 'huggingface_token' in st.secrets:
+            token = st.secrets['huggingface_token']
+
+        # If still not found, ask user
+        if not token:
+            token = st.text_input('Enter your Hugging Face token:', type='password')
+            if not token:
+                st.warning('Please enter your Hugging Face token to proceed')
+                st.stop()
+
+        st.session_state['HUGGING_FACE_TOKEN'] = token
+
+    # Login to Hugging Face
+    login(st.session_state['HUGGING_FACE_TOKEN'])
+    return True
 
 class LlamaDemo:
     def __init__(self):
-        self.model_name = "meta-llama/Llama-2-70b-chat"
-
-        self.…
+        self.model_name = "meta-llama/Llama-2-70b-chat-hf"
+        self._model = None
+        self._tokenizer = None
 
     @property
-    def …
-        if self.…
-            self.…
-            …
-                model=self.model_name,
+    def model(self):
+        if self._model is None:
+            self._model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
                 torch_dtype=torch.float16,
                 device_map="auto",
+                trust_remote_code=True,
+                load_in_8bit=True  # to optimize memory use
+            )
+        return self._model
+
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
                 trust_remote_code=True
             )
-        return self.…
+        return self._tokenizer
 
-    def generate_response(self, prompt: str, …
+    def generate_response(self, prompt: str, max_new_tokens: int = 512) -> str:
         # Format prompt for Llama 2 chat
         formatted_prompt = f"[INST] {prompt} [/INST]"
 
-        …
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=1,
+                temperature=0.7,
+                do_sample=True,
+                top_p=0.9,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
 
-        …
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response.split("[/INST]")[-1].strip()
 
 def main():
     st.set_page_config(
-        page_title="Llama 2 …
+        page_title="Llama 2 Demo",
         page_icon="🦙",
         layout="wide"
     )
 
     st.title("🦙 Llama 2 Chat Demo")
 
+    # Initialize Hugging Face authentication
+    if init_huggingface():
+        st.success("Successfully authenticated with Hugging Face!")
+
     # Initialize model
     if 'llama' not in st.session_state:
         with st.spinner("Loading Llama 2... This might take a few minutes..."):

@@ -82,13 +123,6 @@ def main():
         st.error(f"Error: {str(e)}")
 
     with st.sidebar:
-        st.markdown("""
-        ### About
-        This demo uses Llama-2-70B-chat, a large language model from Meta.
-
-        The model runs with automatic device mapping and mixed precision for optimal performance.
-        """)
-
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
             st.experimental_rerun()
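Two dependency notes on the new loading path, as facts about the libraries rather than anything stated in this commit: load_in_8bit=True is implemented via bitsandbytes, and device_map="auto" requires accelerate. A Space running this app.py would need both installed; the requirements.txt sketch below is an assumption, since that file is not part of this commit:

streamlit
torch
transformers
accelerate       # needed for device_map="auto"
bitsandbytes     # needed for load_in_8bit=True
huggingface_hub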
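The [INST] {prompt} [/INST] wrapper in generate_response formats a single user turn only. Llama 2's chat format also defines a <<SYS>> block for a system prompt; a minimal sketch of that fuller single-turn format (the helper name format_llama2_prompt is hypothetical, not from the commit):

# Single-turn Llama 2 chat prompt with an optional system message,
# following the <<SYS>> convention of the Llama 2 chat format.
def format_llama2_prompt(user_msg: str, system_msg: str = "") -> str:
    if system_msg:
        return f"[INST] <<SYS>>\n{system_msg}\n<</SYS>>\n\n{user_msg} [/INST]"
    return f"[INST] {user_msg} [/INST]"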
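For a quick check of the lazy-loading behavior outside Streamlit, a minimal smoke test, assuming this app.py guards its main() call behind if __name__ == "__main__", that HUGGINGFACE_TOKEN is set (the same variable init_huggingface reads), and that the token has access to the gated meta-llama/Llama-2-70b-chat-hf weights:

# Smoke test for the updated LlamaDemo class (run outside Streamlit).
import os
from huggingface_hub import login

from app import LlamaDemo  # the class defined in this commit

login(os.environ["HUGGINGFACE_TOKEN"])  # same token the app asks the user for

demo = LlamaDemo()  # cheap to construct: model and tokenizer load on first access
print(demo.generate_response("Name three uses of a language model.", max_new_tokens=128))

Constructing LlamaDemo is instant because both heavyweight objects sit behind properties; the 70B weights are only downloaded and quantized on the first generate_response call.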