File size: 5,042 Bytes
91cdd71
 
 
 
 
 
aa46227
 
c7fafda
 
91cdd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6efd79e
354fb4d
6887954
 
 
 
 
 
 
 
6bace28
6887954
 
6bace28
6887954
6bace28
6887954
 
 
 
 
 
 
 
 
 
 
6bace28
63a82d8
 
91cdd71
 
eb08d30
91cdd71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os

import streamlit as st

# LangChain (local HF pipeline)
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_huggingface import HuggingFacePipeline
#from langchain.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
#from langchain.schema import StrOutputParser
from langchain_core.output_parsers import StrOutputParser

# LlamaIndex (modular imports)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

st.set_page_config(page_title="Tiny LLM Starter", page_icon="🧪", layout="centered")
st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")

# ---- Sidebar config ----
st.sidebar.header("Model Settings")
MODEL_ID = st.sidebar.text_input("HF model id (seq2seq)", value="google/flan-t5-small")
MAX_NEW_TOKENS = st.sidebar.slider("max_new_tokens", 32, 512, 256, 32)
TEMP = st.sidebar.slider("temperature", 0.0, 1.0, 0.2, 0.1)

st.sidebar.markdown(
    """
**Tips**
- Uses local CPU (no key required)
- Small model → lower memory, faster cold start
- You can later add an `HF_TOKEN` secret for hosted inference
"""
)

# ---- Cache helpers to avoid reloading on every interaction ----
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    tok = AutoTokenizer.from_pretrained(model_id)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    gen = pipeline(
        task="text2text-generation",
        model=mdl,
        tokenizer=tok,
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=gen)

@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    """Build (and cache) a LlamaIndex query engine over the ./data directory.

    Wires a local HF model (via HuggingFaceLLM) and a tiny sentence-transformers
    embedding model into LlamaIndex's global Settings, then indexes ./data and
    returns a query engine with top-3 retrieval.

    Raises:
        Whatever AutoConfig / HuggingFaceLLM raise on a bad model id — surfaced
        to the Streamlit UI rather than silently swallowed (the previous version
        hid a NameError and left the LLM unset).
    """
    # Tiny, fast sentence-transformers model for embeddings.
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Pick the pipeline task from the architecture: encoder-decoder (seq2seq)
    # families need "text2text-generation"; decoder-only need "text-generation".
    config = AutoConfig.from_pretrained(model_id)
    if config.model_type in {"t5", "mt5", "bart", "mbart", "pegasus", "marian", "prophetnet"}:
        task = "text2text-generation"
    else:
        task = "text-generation"

    # Shared kwargs so the fallback below can't drift from the primary attempt.
    llm_kwargs = dict(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        generate_kwargs={"max_new_tokens": max_new_tokens, "temperature": float(temperature)},
        device_map="cpu",
    )
    try:
        llm = HuggingFaceLLM(task=task, **llm_kwargs)
    except TypeError:
        # Older llama-index releases do not accept a `task` kwarg.
        llm = HuggingFaceLLM(**llm_kwargs)

    # Route all LlamaIndex components through the local models. Without these
    # assignments LlamaIndex falls back to its default (OpenAI) LLM/embeddings
    # and fails when no API key is configured.
    Settings.embed_model = embed
    Settings.llm = llm

    # Load small docs (data/notes.txt, plus any uploaded data/user.txt).
    docs = SimpleDirectoryReader(input_dir="data").load_data()
    index = VectorStoreIndex.from_documents(docs)
    return index.as_query_engine(similarity_top_k=3)

# Two demo surfaces: plain LangChain generation vs. LlamaIndex retrieval+answer.
tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])

# -------- Tab 1: LangChain Chat --------
with tab1:
    st.subheader("LangChain (local HF pipeline)")
    # Cached loader: cheap on reruns unless MODEL_ID / MAX_NEW_TOKENS changed.
    lc_llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)

    user_q = st.text_input("Ask anything:", value="What is this app?")
    if st.button("Generate (LangChain)", type="primary"):
        prompt = PromptTemplate.from_template(
            "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
        )
        # LCEL pipe: prompt → local HF pipeline → plain string.
        chain = prompt | lc_llm | StrOutputParser()
        with st.spinner("Thinking..."):
            out = chain.invoke({"q": user_q})
        st.write(out)

# -------- Tab 2: LlamaIndex mini-RAG --------
with tab2:
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")
    uploaded = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])

    # If user uploads a file, write it into ./data and rebuild the index
    # NOTE(review): the loader below is @st.cache_resource-cached on
    # (model, tokens, temp) only, so an upload alone will NOT trigger a
    # re-index until the cache is cleared or a setting changes — confirm
    # whether load_llamaindex_stack.clear() should be called here.
    if uploaded is not None:
        os.makedirs("data", exist_ok=True)
        with open(os.path.join("data", "user.txt"), "wb") as f:
            f.write(uploaded.read())

    qe = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)

    rag_q = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
    if st.button("Search + Answer (LlamaIndex)"):
        with st.spinner("Searching + generating..."):
            ans = qe.query(rag_q)
        # Response object: .response is the answer text, .source_nodes the
        # retrieved chunks with similarity scores.
        st.write(ans.response)
        with st.expander("Show retrieved nodes"):
            for n in ans.source_nodes:
                st.markdown(f"**Score:** {n.score:.3f}")
                st.code(n.node.get_content()[:500])