import os

import streamlit as st

# LangChain (local HF pipeline)
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
#from langchain.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
#from langchain.schema import StrOutputParser
from langchain_core.output_parsers import StrOutputParser

# LlamaIndex (modular imports)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
# Page setup — set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Tiny LLM Starter", page_icon="🧪", layout="centered")
st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")
# ---- Sidebar config ----
# These module-level values are read by both tabs below.
st.sidebar.header("Model Settings")
# Hugging Face model id; default is a small seq2seq model that runs on CPU.
MODEL_ID = st.sidebar.text_input("HF model id (seq2seq)", value="google/flan-t5-small")
# slider(label, min, max, default, step)
MAX_NEW_TOKENS = st.sidebar.slider("max_new_tokens", 32, 512, 256, 32)
# Sampling temperature — currently only forwarded to the LlamaIndex stack.
TEMP = st.sidebar.slider("temperature", 0.0, 1.0, 0.2, 0.1)
st.sidebar.markdown(
"""
**Tips**
- Uses local CPU (no key required)
- Small model → lower memory, faster cold start
- You can later add an `HF_TOKEN` secret for hosted inference
"""
)
# ---- Cache helpers to avoid reloading on every interaction ----
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    """Load a local HF seq2seq model and wrap it as a LangChain-compatible LLM.

    Cached by Streamlit so the weights are loaded only once per
    (model_id, max_new_tokens) combination, not on every rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    text2text = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=text2text)
@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    """Build a LlamaIndex query engine over ./data using only local HF models.

    Configures LlamaIndex's global Settings to use a tiny local embedding
    model and the given HF model for generation, indexes the files under
    ./data, and returns a query engine retrieving the top-3 similar chunks.
    Cached by Streamlit per (model_id, max_new_tokens, temperature) combo.
    """
    # Tiny, fast sentence-transformers model for embeddings
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Pick the pipeline task from the model architecture: encoder-decoder
    # (seq2seq) families need "text2text-generation", decoder-only models
    # need "text-generation".
    seq2seq_types = {"t5", "mt5", "bart", "mbart", "pegasus", "marian", "prophetnet"}
    try:
        config = AutoConfig.from_pretrained(model_id)
        task = "text2text-generation" if config.model_type in seq2seq_types else "text-generation"
    except Exception:
        # Config could not be fetched (e.g. offline) — fall back to the
        # decoder-only default rather than aborting the whole stack.
        task = "text-generation"

    llm_kwargs = dict(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        generate_kwargs={"max_new_tokens": max_new_tokens, "temperature": float(temperature)},
        device_map="cpu",
    )
    try:
        llm = HuggingFaceLLM(task=task, **llm_kwargs)
    except TypeError:
        # Older llama-index releases do not accept a `task` keyword.
        llm = HuggingFaceLLM(**llm_kwargs)

    # Route both embedding and generation through the local models. Without
    # this, LlamaIndex falls back to its OpenAI defaults and the app would
    # fail without an API key — contradicting the sidebar's "no key required".
    Settings.embed_model = embed
    Settings.llm = llm

    # Load small docs (data/notes.txt plus any uploaded file)
    docs = SimpleDirectoryReader(input_dir="data").load_data()
    index = VectorStoreIndex.from_documents(docs)
    return index.as_query_engine(similarity_top_k=3)
tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])

# -------- Tab 1: LangChain Chat --------
with tab1:
    st.subheader("LangChain (local HF pipeline)")
    local_llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)
    question = st.text_input("Ask anything:", value="What is this app?")
    if st.button("Generate (LangChain)", type="primary"):
        # LCEL pipeline: prompt -> local HF model -> plain string
        answer_chain = (
            PromptTemplate.from_template(
                "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
            )
            | local_llm
            | StrOutputParser()
        )
        with st.spinner("Thinking..."):
            answer = answer_chain.invoke({"q": question})
        st.write(answer)
# -------- Tab 2: LlamaIndex mini-RAG --------
with tab2:
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")
    upload = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])
    # An uploaded file is persisted into ./data so the reader picks it up
    if upload is not None:
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", "user.txt"), "wb") as sink:
            sink.write(upload.read())
    engine = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)
    query = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
    if st.button("Search + Answer (LlamaIndex)"):
        with st.spinner("Searching + generating..."):
            result = engine.query(query)
        st.write(result.response)
        # Show which chunks the answer was grounded in
        with st.expander("Show retrieved nodes"):
            for hit in result.source_nodes:
                st.markdown(f"**Score:** {hit.score:.3f}")
                st.code(hit.node.get_content()[:500])