Spaces:

raidAthmaneBenlala
/

geminiClone

Build error

App Files Files Community

geminiClone / app.py

raidAthmaneBenlala

Update app.py

10a9146 verified 4 months ago

raw

history blame contribute delete

3.46 kB

	import streamlit as st
	import os

	# Correct imports for newer LangChain versions
	from langchain.chains import ConversationChain
	from langchain.memory import ConversationSummaryBufferMemory
	from langchain_community.llms import LlamaCpp
	from huggingface_hub import hf_hub_download

	# --- Constants ---
	# Quantized (compressed) build of Gemma 2 (2B parameters) in GGUF format.
	# The smaller file is what allows it to run on the FREE Hugging Face CPU tier.
	REPO_ID = "bartowski/gemma-2-2b-it-GGUF"
	FILENAME = "gemma-2-2b-it-Q5_K_M.gguf"

	# Page Config
	st.set_page_config(
	    page_title="Gemma Free Chat",
	    page_icon="🦙",
	)

	@st.cache_resource
	def load_model():
	    """Download the GGUF weights and build the local LlamaCpp model.

	    Decorated with ``st.cache_resource`` so the download and the model
	    initialisation are not repeated on every interaction.

	    Returns:
	        The ``LlamaCpp`` instance built from ``FILENAME`` in ``REPO_ID``.
	    """
	    print(f"Downloading {FILENAME} from {REPO_ID}...")
	    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

	    # Initialize LlamaCpp (the engine that runs the model locally)
	    llm = LlamaCpp(
	        model_path=model_path,
	        temperature=0.7,
	        max_tokens=512,
	        top_p=0.9,
	        # Context window size (how much it remembers in one go)
	        n_ctx=2048,
	        # Important for free tier: keep verbose logging off to save log
	        # buffer space. This flag was previously True, contradicting the
	        # stated intent.
	        verbose=False,
	    )
	    return llm

	# --- UI Layout ---
	# Page heading followed by a short description of how the app runs.
	page_heading = "🦙 Gemma 2 (2B) - Local & Free"
	about_text = """
	This chatbot runs entirely inside this Space using your CPU.
	* No API Key required.
	* Model: Gemma-2-2B-it (Quantized GGUF)
	* Speed: Might be slower than API models because it runs on free hardware.
	"""
	st.title(page_heading)
	st.markdown(about_text)

	# --- Initialize Model & State ---
	# Load the model behind a spinner; if loading raises, show the error on the
	# page and stop the script.
	loading_notice = "Loading AI Model (this takes a minute first time)..."
	try:
	    with st.spinner(loading_notice):
	        llm = load_model()
	except Exception as load_error:
	    st.error(f"Failed to load model: {load_error}")
	    st.stop()

	# Initialize Chat History
	# Seed the transcript with an assistant greeting when the session has no
	# "messages" entry yet.
	if "messages" not in st.session_state:
	    greeting = {
	        "role": "assistant",
	        "content": "Hello! I'm running locally on Gemma 2B. How can I help?",
	    }
	    st.session_state["messages"] = [greeting]

	# Initialize Chain with Memory
	# Built only when the session does not already hold a chain, and stored in
	# st.session_state so the same chain (and its memory) is reused afterwards.
	if "conversation_chain" not in st.session_state:
	    # Summary Buffer: keeps recent messages, summarizes old ones to save RAM/Time
	    memory = ConversationSummaryBufferMemory(
	        llm=llm,
	        max_token_limit=500,  # Summarize when history exceeds ~500 tokens
	        # ConversationChain's default prompt is a plain-text template and
	        # LlamaCpp is a text-completion LLM, so the history must be rendered
	        # as a string. With return_messages=True the repr of a list of
	        # message objects would be pasted into the prompt instead.
	        return_messages=False,
	    )

	    st.session_state.conversation_chain = ConversationChain(
	        llm=llm,
	        memory=memory,
	        verbose=True,
	    )

	# --- Chat Interface ---

	# 1. Display existing messages
	# Replay the stored transcript, one chat bubble per entry.
	for entry in st.session_state["messages"]:
	    speaker, text = entry["role"], entry["content"]
	    with st.chat_message(speaker):
	        st.markdown(text)

	# 2. Handle User Input
	prompt = st.chat_input("Type your message...")
	if prompt:
	    # Record the user's turn in session state and echo it in the chat
	    st.session_state["messages"].append({"role": "user", "content": prompt})
	    with st.chat_message("user"):
	        st.markdown(prompt)

	    # Generate Response
	    chain = st.session_state.conversation_chain
	    if chain:
	        with st.chat_message("assistant"), st.spinner("Thinking... (CPU working hard 🐢)"):
	            try:
	                reply = chain.predict(input=prompt)
	                st.markdown(reply)
	                # Only a successful reply is added to the transcript
	                st.session_state["messages"].append(
	                    {"role": "assistant", "content": reply}
	                )
	            except Exception as gen_error:
	                st.error(f"Error during generation: {gen_error}")