import os

import streamlit as st

# LangChain (local HF pipeline)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# LlamaIndex (modular imports)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

st.set_page_config(page_title="Tiny LLM Starter", page_icon="🧪", layout="centered")
st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")

# ---- Sidebar config ----
st.sidebar.header("Model Settings")
MODEL_ID = st.sidebar.text_input("HF model id (seq2seq)", value="google/flan-t5-small")
MAX_NEW_TOKENS = st.sidebar.slider("max_new_tokens", 32, 512, 256, 32)
TEMP = st.sidebar.slider("temperature", 0.0, 1.0, 0.2, 0.1)

st.sidebar.markdown(
    """
    **Tips**
    - Uses local CPU (no key required)
    - Small model → lower memory, faster cold start
    - You can later add an `HF_TOKEN` secret for hosted inference
    """
)

# ---- Cache helpers to avoid reloading on every interaction ----
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    """Build a local text2text pipeline and wrap it for LangChain."""
    tok = AutoTokenizer.from_pretrained(model_id)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    gen = pipeline(
        task="text2text-generation",
        model=mdl,
        tokenizer=tok,
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=gen)


@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    """Build embeddings + LLM, index ./data, and return a query engine."""
    # Tiny, fast sentence-transformers model for embeddings
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Wrap the same tiny HF model for LlamaIndex
    llm = HuggingFaceLLM(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        generate_kwargs={"max_new_tokens": max_new_tokens, "temperature": temperature},
        device_map="cpu",
    )
    Settings.embed_model = embed
    Settings.llm = llm

    # Load small docs. Note: the parameter is `input_dir` (singular), and the
    # directory must contain at least one file (e.g. data/notes.txt).
    docs = SimpleDirectoryReader(input_dir="data").load_data()
    index = VectorStoreIndex.from_documents(docs)
    query_engine = index.as_query_engine(similarity_top_k=3)
    return query_engine


tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])

# -------- Tab 1: LangChain Chat --------
with tab1:
    st.subheader("LangChain (local HF pipeline)")
    lc_llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)

    user_q = st.text_input("Ask anything:", value="What is this app?")
    if st.button("Generate (LangChain)", type="primary"):
        prompt = PromptTemplate.from_template(
            "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
        )
        chain = prompt | lc_llm | StrOutputParser()
        with st.spinner("Thinking..."):
            out = chain.invoke({"q": user_q})
        st.write(out)

# -------- Tab 2: LlamaIndex mini-RAG --------
with tab2:
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")

    uploaded = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])

    # If the user uploads a file, write it into ./data and clear the cached
    # stack so the next call actually rebuilds the index over the new file
    # (otherwise @st.cache_resource would keep serving the stale index).
    if uploaded is not None:
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", "user.txt"), "wb") as f:
            f.write(uploaded.read())
        load_llamaindex_stack.clear()

    qe = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)

    rag_q = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
    if st.button("Search + Answer (LlamaIndex)"):
        with st.spinner("Searching + generating..."):
            ans = qe.query(rag_q)
        st.write(ans.response)

        with st.expander("Show retrieved nodes"):
            for n in ans.source_nodes:
                st.markdown(f"**Score:** {n.score:.3f}")
                st.code(n.node.get_content()[:500])
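# ---------------------------------------------------------------------------
# Running the app (a minimal sketch, assuming the stack implied by the imports
# above; the package list is an inference, not pinned by this file, and
# `app.py` is a hypothetical filename — substitute whatever this file is
# saved as):
#
#   pip install streamlit torch transformers sentence-transformers \
#       langchain-core langchain-huggingface \
#       llama-index-core llama-index-embeddings-huggingface \
#       llama-index-llms-huggingface
#   streamlit run app.py
#
# The first run downloads google/flan-t5-small and all-MiniLM-L6-v2 from the
# Hugging Face Hub, so expect a one-time delay before the UI responds. Make
# sure ./data contains at least one text file (e.g. notes.txt) before opening
# the LlamaIndex tab.
# ---------------------------------------------------------------------------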