# langApp2/src/streamlit_app.py
import os
import streamlit as st
# LangChain (local HF pipeline)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
# LlamaIndex (modular imports)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
st.set_page_config(page_title="Tiny LLM Starter", page_icon="🧪", layout="centered")
st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")
# ---- Sidebar config ----
st.sidebar.header("Model Settings")
MODEL_ID = st.sidebar.text_input("HF model id (seq2seq)", value="google/flan-t5-small")
MAX_NEW_TOKENS = st.sidebar.slider("max_new_tokens", 32, 512, 256, 32)
TEMP = st.sidebar.slider("temperature", 0.0, 1.0, 0.2, 0.1)
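# Note: TEMP is only passed to the LlamaIndex LLM below; the LangChain pipeline
# generates with the model's default decoding settings.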
st.sidebar.markdown(
"""
**Tips**
- Uses local CPU (no key required)
- Small model → lower memory, faster cold start
- You can later add an `HF_TOKEN` secret for hosted inference
"""
)
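# If you later add an HF_TOKEN secret (Space settings or .streamlit/secrets.toml),
# it can be read with os.environ.get("HF_TOKEN") or st.secrets.get("HF_TOKEN") and
# passed to a hosted-inference client; the local pipelines below need no token.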
# ---- Cache helpers to avoid reloading on every interaction ----
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    # Load the seq2seq tokenizer/model once and wrap them in a transformers pipeline
    tok = AutoTokenizer.from_pretrained(model_id)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    gen = pipeline(
        task="text2text-generation",
        model=mdl,
        tokenizer=tok,
        max_new_tokens=max_new_tokens,
    )
    # HuggingFacePipeline exposes the pipeline as a LangChain Runnable,
    # so it can be composed with the `|` operator below (prompt | llm | parser)
    return HuggingFacePipeline(pipeline=gen)
@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    # Tiny, fast sentence-transformers model for embeddings
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # Wrap the same tiny HF model for LlamaIndex
    llm = HuggingFaceLLM(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        # do_sample is required for temperature to take effect; fall back to greedy at 0.0
        generate_kwargs={
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "do_sample": temperature > 0,
        },
        device_map="cpu",
    )
    # Settings holds LlamaIndex's global defaults; the index and query engine
    # below pick up this embed model and LLM automatically
    Settings.embed_model = embed
    Settings.llm = llm
    # Load small docs from ./data (e.g. data/notes.txt)
    docs = SimpleDirectoryReader(input_dir="data").load_data()
    index = VectorStoreIndex.from_documents(docs)
    query_engine = index.as_query_engine(similarity_top_k=3)
    return query_engine
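# Note: both cached loaders above key their cache on the function arguments,
# so changing the sidebar settings (model id, max_new_tokens, temperature)
# builds a fresh pipeline/index instead of reusing the old one.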
tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])
# -------- Tab 1: LangChain Chat --------
with tab1:
    st.subheader("LangChain (local HF pipeline)")
    lc_llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)
    user_q = st.text_input("Ask anything:", value="What is this app?")
    if st.button("Generate (LangChain)", type="primary"):
        prompt = PromptTemplate.from_template(
            "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
        )
        # LCEL chain: fill the prompt, run the local pipeline, return plain text
        chain = prompt | lc_llm | StrOutputParser()
        with st.spinner("Thinking..."):
            out = chain.invoke({"q": user_q})
        st.write(out)
# -------- Tab 2: LlamaIndex mini-RAG --------
with tab2:
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")
    uploaded = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])
    # If the user uploads a file, write it into ./data and drop the cached index
    # so the next load_llamaindex_stack call rebuilds it over the new file
    if uploaded is not None:
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", "user.txt"), "wb") as f:
            f.write(uploaded.read())
        load_llamaindex_stack.clear()
    qe = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)
    rag_q = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
    if st.button("Search + Answer (LlamaIndex)"):
        with st.spinner("Searching + generating..."):
            ans = qe.query(rag_q)
        st.write(ans.response)
        with st.expander("Show retrieved nodes"):
            for n in ans.source_nodes:
                if n.score is not None:
                    st.markdown(f"**Score:** {n.score:.3f}")
                st.code(n.node.get_content()[:500])
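# Assumed (unpinned) dependencies for the imports above, using their usual PyPI names:
# streamlit, torch, transformers, langchain-core, langchain-huggingface,
# llama-index-core, llama-index-embeddings-huggingface, llama-index-llms-huggingface,
# sentence-transformers.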