|
|
import os |
|
|
import streamlit as st |
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline |
|
|
from langchain_huggingface import HuggingFacePipeline |
|
|
from langchain.prompts import PromptTemplate |
|
|
from langchain.schema import StrOutputParser |
|
|
|
|
|
|
|
|
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings |
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
|
|
from llama_index.llms.huggingface import HuggingFaceLLM |
|
|
|
|
|
# --- Page chrome and sidebar controls (values below are read by both tabs) ---
st.set_page_config(page_title="Tiny LLM Starter", page_icon="🧪", layout="centered")

st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")

st.sidebar.header("Model Settings")

# Hugging Face model id; default is a small seq2seq model that runs on CPU.
MODEL_ID: str = st.sidebar.text_input("HF model id (seq2seq)", value="google/flan-t5-small")

# Generation length cap: min 32, max 512, default 256, step 32.
MAX_NEW_TOKENS: int = st.sidebar.slider("max_new_tokens", 32, 512, 256, 32)

# Sampling temperature in [0.0, 1.0]; forwarded to the LlamaIndex LLM below.
# NOTE(review): the LangChain pipeline in tab 1 does not use this value.
TEMP: float = st.sidebar.slider("temperature", 0.0, 1.0, 0.2, 0.1)

st.sidebar.markdown(
"""


**Tips**


- Uses local CPU (no key required)


- Small model → lower memory, faster cold start


- You can later add an `HF_TOKEN` secret for hosted inference


"""
)
|
|
|
|
|
|
|
|
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    """Build a LangChain LLM backed by a local HF text2text pipeline.

    Cached by Streamlit so the model is downloaded and loaded only once
    per (model_id, max_new_tokens) combination.

    Args:
        model_id: Hugging Face hub id of a seq2seq model.
        max_new_tokens: generation length cap passed to the pipeline.

    Returns:
        A ``HuggingFacePipeline`` usable inside LCEL chains.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    hf_pipe = pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )
    # Wrap the raw transformers pipeline so LangChain can drive it.
    return HuggingFacePipeline(pipeline=hf_pipe)
|
|
|
|
|
@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    """Build and cache a LlamaIndex query engine over the ``./data`` directory.

    Args:
        model_id: Hugging Face hub id used for both model and tokenizer.
        max_new_tokens: generation length cap.
        temperature: sampling temperature forwarded to generate_kwargs.

    Returns:
        A query engine that retrieves the top-3 most similar chunks and
        answers with the configured Hugging Face model.
    """
    # Small, CPU-friendly sentence-transformer for embeddings.
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # NOTE(review): HuggingFaceLLM loads models as causal LMs by default;
    # a seq2seq id like flan-t5 may need a custom model loader — confirm.
    llm = HuggingFaceLLM(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        generate_kwargs={"max_new_tokens": max_new_tokens, "temperature": temperature},
        device_map="cpu",
    )

    # Register models globally so index construction and querying use them.
    Settings.embed_model = embed
    Settings.llm = llm

    # BUG FIX: SimpleDirectoryReader takes `input_dir` (a single path) or
    # `input_files` (a list of files); the old `input_dirs=["data"]` kwarg
    # does not exist and raised a TypeError at call time.
    docs = SimpleDirectoryReader(input_dir="data").load_data()

    index = VectorStoreIndex.from_documents(docs)

    return index.as_query_engine(similarity_top_k=3)
|
|
|
|
|
# Two independent demos: plain LangChain generation vs. LlamaIndex mini-RAG.
tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])
|
|
|
|
|
|
|
|
with tab1:
    # Plain prompt -> LLM -> string chain; no retrieval involved.
    st.subheader("LangChain (local HF pipeline)")
    llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)

    question = st.text_input("Ask anything:", value="What is this app?")
    if st.button("Generate (LangChain)", type="primary"):
        template = PromptTemplate.from_template(
            "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
        )
        # LCEL pipe: fill the template, run the model, coerce output to str.
        answer_chain = template | llm | StrOutputParser()
        with st.spinner("Thinking..."):
            answer = answer_chain.invoke({"q": question})
        st.write(answer)
|
|
|
|
|
|
|
|
with tab2:
    # Mini-RAG: index ./data, retrieve top-k chunks, answer with the LLM.
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")
    uploaded = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])

    if uploaded is not None:
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", "user.txt"), "wb") as f:
            f.write(uploaded.read())
        # BUG FIX: the query engine is cached by st.cache_resource keyed only
        # on the model settings, so a freshly uploaded file was never
        # re-indexed. Drop the cache so the stack is rebuilt over ./data.
        load_llamaindex_stack.clear()

    qe = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)

    rag_q = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
    if st.button("Search + Answer (LlamaIndex)"):
        with st.spinner("Searching + generating..."):
            ans = qe.query(rag_q)
        st.write(ans.response)
        with st.expander("Show retrieved nodes"):
            for n in ans.source_nodes:
                # NOTE(review): n.score may be None for some retrievers —
                # the default vector retriever is assumed to populate it.
                st.markdown(f"**Score:** {n.score:.3f}")
                st.code(n.node.get_content()[:500])
|
|
|