Spaces:
Build error
Build error
| import streamlit as st | |
| import os | |
| # Correct imports for newer LangChain versions | |
| from langchain.chains import ConversationChain | |
| from langchain.memory import ConversationSummaryBufferMemory | |
| from langchain_community.llms import LlamaCpp | |
| from huggingface_hub import hf_hub_download | |
# --- Page configuration ---
# NOTE(review): the page_icon glyph looks mis-encoded (probably an emoji that
# was garbled somewhere along the way) -- confirm the intended character.
st.set_page_config(
    page_title="Gemma Free Chat",
    page_icon="π¦",
)

# --- Constants ---
# Hugging Face Hub location of the model weights: a quantized (compressed)
# GGUF build of Gemma 2 (2B parameters), chosen so the app can run on the
# free Hugging Face CPU tier.
REPO_ID = "bartowski/gemma-2-2b-it-GGUF"
FILENAME = "gemma-2-2b-it-Q5_K_M.gguf"
@st.cache_resource(show_spinner=False)
def load_model():
    """Download the GGUF weights and build the local LlamaCpp LLM.

    The function is wrapped in ``st.cache_resource`` so the model is built
    once and the same instance is reused across Streamlit reruns, instead of
    being reloaded on every interaction. The built-in cache spinner is
    disabled because the caller already shows its own loading spinner.

    Returns:
        The configured ``LlamaCpp`` instance.

    Raises:
        Whatever ``hf_hub_download`` or ``LlamaCpp`` raise when the download
        or model initialisation fails; the caller is expected to handle it.
    """
    print(f"Downloading {FILENAME} from {REPO_ID}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    # LlamaCpp is the engine that runs the model locally.
    return LlamaCpp(
        model_path=model_path,
        temperature=0.7,
        max_tokens=512,
        top_p=0.9,
        # Context window size (how much it can take in at once).
        n_ctx=2048,
        # Important for the free tier: keep verbose logging off to save
        # log buffer space.
        verbose=False,
    )
# --- UI Layout ---
# Introductory blurb rendered as Markdown below the page title.
intro_md = """
This chatbot runs **entirely inside this Space** using your CPU.
* **No API Key required.**
* **Model:** Gemma-2-2B-it (Quantized GGUF)
* **Speed:** Might be slower than API models because it runs on free hardware.
"""

# NOTE(review): the leading glyph in the title looks mis-encoded (probably an
# emoji) -- confirm the intended character.
st.title("π¦ Gemma 2 (2B) - Local & Free")
st.markdown(intro_md)
# --- Initialize Model & State ---
try:
    with st.spinner("Loading AI Model (this takes a minute first time)..."):
        llm = load_model()
except Exception as e:
    # Top-level UI boundary: report the failure to the user and stop this
    # script run, since nothing below can work without a model.
    st.error(f"Failed to load model: {e}")
    st.stop()

# Chat transcript shown in the UI; seeded with a greeting on first run.
if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "Hello! I'm running locally on Gemma 2B. How can I help?"}
    ]

# Conversation chain (LLM + memory), created once per session.
if "conversation_chain" not in st.session_state:
    # Summary buffer: keeps recent messages verbatim and summarizes older
    # ones to save RAM/time.
    memory = ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=500,  # Summarize when history exceeds ~500 tokens
        # The chain below uses ConversationChain's default prompt, which is a
        # plain string template. History must therefore be rendered as text;
        # with return_messages=True the message objects would be interpolated
        # into the prompt as their Python repr.
        return_messages=False,
    )
    st.session_state.conversation_chain = ConversationChain(
        llm=llm,
        memory=memory,
        verbose=True,
    )
# --- Chat Interface ---
# 1. Replay the stored transcript so the conversation persists across reruns.
for entry in st.session_state.messages:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])

# 2. Handle new user input (falsy when nothing was submitted on this run).
prompt = st.chat_input("Type your message...")
if prompt:
    # Record the user's message and echo it in the chat.
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate and display the assistant's reply.
    chain = st.session_state.conversation_chain
    if chain:
        # NOTE(review): the trailing glyph in the spinner text looks
        # mis-encoded (probably an emoji) -- confirm the intended character.
        with st.chat_message("assistant"), st.spinner("Thinking... (CPU working hard π’)"):
            try:
                response = chain.predict(input=prompt)
                st.markdown(response)
                st.session_state.messages.append(
                    {"role": "assistant", "content": response}
                )
            except Exception as e:
                # UI boundary: surface the generation failure in the chat
                # instead of crashing the app.
                st.error(f"Error during generation: {e}")