geminiClone / app.py
raidAthmaneBenlala's picture
Update app.py
10a9146 verified
import streamlit as st
import os
# Correct imports for newer LangChain versions
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain_community.llms import LlamaCpp
from huggingface_hub import hf_hub_download
# Page Config
# page_icon is the llama emoji shown next to the page title in the browser tab.
st.set_page_config(page_title="Gemma Free Chat", page_icon="🦙")

# --- Constants ---
# We use a quantized (compressed) version of Gemma 2 (2B parameters).
# This allows it to run on the FREE Hugging Face CPU tier.
# REPO_ID is the Hugging Face Hub repository and FILENAME the GGUF file
# inside it; both are passed to hf_hub_download() by load_model().
REPO_ID = "bartowski/gemma-2-2b-it-GGUF"
FILENAME = "gemma-2-2b-it-Q5_K_M.gguf"
@st.cache_resource
def load_model():
    """
    Download the GGUF weights and load them into memory with LlamaCpp.

    The file named by ``FILENAME`` is fetched from the ``REPO_ID`` repository
    via ``hf_hub_download`` and the resulting local path is handed to
    ``LlamaCpp``. The function is wrapped in ``st.cache_resource`` so the
    model is not reloaded on every interaction.

    Returns:
        The initialized ``LlamaCpp`` LLM instance.
    """
    print(f"Downloading {FILENAME} from {REPO_ID}...")
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    # Initialize LlamaCpp (the engine that runs the model locally)
    return LlamaCpp(
        model_path=model_path,
        temperature=0.7,
        max_tokens=512,
        top_p=0.9,
        # Context window size (how much it remembers in one go)
        n_ctx=2048,
        # Important for free tier: keep verbose logging off to save buffer space
        verbose=False,
    )
# --- UI Layout ---
# Page heading followed by a short Markdown blurb describing how the bot runs.
st.title("🦙 Gemma 2 (2B) - Local & Free")
st.markdown(
    """
This chatbot runs **entirely inside this Space** using your CPU.
* **No API Key required.**
* **Model:** Gemma-2-2B-it (Quantized GGUF)
* **Speed:** Might be slower than API models because it runs on free hardware.
"""
)
# --- Initialize Model & State ---
# Load the LLM behind a spinner. If anything goes wrong, show the error in
# the page and stop this script run, since nothing below works without `llm`.
try:
    with st.spinner("Loading AI Model (this takes a minute first time)..."):
        llm = load_model()
except Exception as load_error:
    st.error(f"Failed to load model: {load_error}")
    st.stop()
# Initialize Chat History
# The list of {"role", "content"} dicts rendered in the chat UI; seeded with
# a greeting the first time a session runs this script.
if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "Hello! I'm running locally on Gemma 2B. How can I help?"}
    ]

# Initialize Chain with Memory
# Built once per session and kept in session_state so the conversation memory
# survives Streamlit's reruns.
if "conversation_chain" not in st.session_state:
    # Summary Buffer: keeps recent messages, summarizes old ones to save RAM/time
    memory = ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=500,  # Summarize when history exceeds ~500 tokens
        # ConversationChain's default prompt is a plain string template and
        # LlamaCpp is a text-completion model, so history must be injected as
        # text. With return_messages=True the prompt would receive a list of
        # message objects and render it as a Python repr.
        return_messages=False,
    )
    st.session_state.conversation_chain = ConversationChain(
        llm=llm,
        memory=memory,
        verbose=True,
    )
# --- Chat Interface ---
# 1. Replay the stored conversation, one chat bubble per saved message
for entry in st.session_state.messages:
    role, content = entry["role"], entry["content"]
    with st.chat_message(role):
        st.markdown(content)
# 2. Handle User Input
# st.chat_input returns the submitted text, or a falsy value when nothing was
# submitted on this run, so the whole turn is skipped in that case.
if prompt := st.chat_input("Type your message..."):
    # Add user message to state and UI
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate Response
    if st.session_state.conversation_chain:
        with st.chat_message("assistant"):
            # NOTE(review): the emoji in this label was garbled in the file;
            # it is restored here as the monkey emoji - confirm with the author.
            with st.spinner("Thinking... (CPU working hard 🐒)"):
                try:
                    response = st.session_state.conversation_chain.predict(input=prompt)
                    st.markdown(response)
                    # Persist the reply so it is replayed on the next rerun.
                    st.session_state.messages.append({"role": "assistant", "content": response})
                except Exception as e:
                    # Surface generation failures in the chat instead of crashing the app.
                    st.error(f"Error during generation: {e}")