Spaces:

lukedaca
/

Muj-chatbot

Sleeping

App Files Files Community

lukedaca committed on Dec 12, 2025

Commit

4e11685

verified ·

1 Parent(s): 5e5a3d4

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -56

app.py CHANGED Viewed

@@ -22,7 +22,6 @@ Pravidla pro tebe:
 4. Pamatuj si, co uživatel říkal v předchozích větách této konverzace.
 """.strip()
 st.set_page_config(page_title="AI Rádce s pamětí", layout="centered")
 st.title("🧠 Chytrý Chatbot (s pamětí)")
@@ -32,35 +31,16 @@ MODEL_REPO = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
 MODEL_FILE = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"
-with st.sidebar:
-    st.header("Nastavení")
-    urls_text = st.text_area(
-        "URL zdroje (1 URL na řádek)",
-        value="\n".join(DEFAULT_URLS),
-        height=110,
-    )
-    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
-    max_new_tokens = st.slider("Max nových tokenů (rychlost)", 32, 256, 128, 16)
-    context_window = st.select_slider("Context window", options=[1024, 2048, 3072, 4096], value=2048)
-    cpu_cnt = os.cpu_count() or 2
-    threads = st.slider("Počet vláken (threads)", 1, min(8, cpu_cnt), min(4, cpu_cnt), 1)
-    batch = st.select_slider("Batch", options=[64, 128, 256, 512], value=256)
-    if st.button("🧹 Resetovat konverzaci"):
-        st.session_state.pop("messages", None)
-        st.session_state.pop("chat_engine", None)
-        st.rerun()
 def create_llm(model_path: str, ctx_win: int, max_tok: int, n_threads: int, n_batch: int) -> LlamaCPP:
     """
-    Kompatibilní konstrukce LlamaCPP napříč verzemi llama-index.
-    Některé verze nepřijímají n_threads/n_batch přímo, ale jen přes model_kwargs.
     """
-    # 1) zkusit přímé parametry (novější/verze dle wrapperu)
     try:
         return LlamaCPP(
             model_path=model_path,
@@ -74,7 +54,7 @@ def create_llm(model_path: str, ctx_win: int, max_tok: int, n_threads: int, n_ba
     except TypeError:
         pass
-    # 2) fallback přes model_kwargs (časté u LlamaIndex wrapperu)
     try:
         return LlamaCPP(
             model_path=model_path,
@@ -85,7 +65,7 @@ def create_llm(model_path: str, ctx_win: int, max_tok: int, n_threads: int, n_ba
             verbose=False,
         )
     except TypeError:
-        # 3) poslední fallback – jen threads (někdy n_batch není podporovaný)
         return LlamaCPP(
             model_path=model_path,
             temperature=0.1,
@@ -97,42 +77,97 @@ def create_llm(model_path: str, ctx_win: int, max_tok: int, n_threads: int, n_ba
 @st.cache_resource
-def load_index_and_settings(urls_tuple: tuple[str, ...], ctx_win: int, max_tok: int, n_threads: int, n_batch: int) -> VectorStoreIndex:
-    # stáhnout GGUF do HF cache
-    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
-    llm = create_llm(model_path, ctx_win, max_tok, n_threads, n_batch)
-    Settings.llm = llm
     Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
     docs = SimpleWebPageReader(html_to_text=True).load_data(list(urls_tuple))
     return VectorStoreIndex.from_documents(docs)
-def make_chat_engine() -> object:
-    index = load_index_and_settings(tuple(urls), context_window, max_new_tokens, threads, batch)
-    # paměť per-session (NEcacheovat)
-    memory = ChatMemoryBuffer.from_defaults(token_limit=min(3000, context_window))
     return index.as_chat_engine(
-        chat_mode="context",
         memory=memory,
         system_prompt=SYSTEM_PROMPT,
         verbose=False,
     )
 if "chat_engine" not in st.session_state:
-    with st.spinner("Startuji mozek bota... (načítám model a web)"):
         try:
-            st.session_state.chat_engine = make_chat_engine()
         except Exception as e:
             st.error(f"Chyba při inicializaci: {e}")
             st.stop()
 if "messages" not in st.session_state:
     st.session_state.messages = []
@@ -141,6 +176,47 @@ for msg in st.session_state.messages:
         st.markdown(msg["content"])
 prompt = st.chat_input("Zeptej se (např: Co umíš?)...")
 if prompt:
     st.session_state.messages.append({"role": "user", "content": prompt})
@@ -149,27 +225,25 @@ if prompt:
     with st.chat_message("assistant"):
         placeholder = st.empty()
-        full = ""
-        t0 = time.time()
-        with st.spinner("Přemýšlím..."):
-            try:
-                stream = st.session_state.chat_engine.stream_chat(prompt)
-                for chunk in stream.response_gen:
-                    full += chunk
-                    placeholder.markdown(full)
-                if not full.strip():
-                    full = getattr(stream, "response", None) or "Nedostal jsem žádná data k odpovědi."
-                    placeholder.markdown(full)
-            except Exception as e:
-                full = f"Chyba při generování odpovědi: {e}"
-                placeholder.markdown(full)
-        st.caption(f"Hotovo za {time.time() - t0:.1f}s")
-    st.session_state.messages.append({"role": "assistant", "content": full})

 4. Pamatuj si, co uživatel říkal v předchozích větách této konverzace.
 """.strip()
 st.set_page_config(page_title="AI Rádce s pamětí", layout="centered")
 st.title("🧠 Chytrý Chatbot (s pamětí)")
 MODEL_FILE = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"
+def _clamp(v: int, lo: int, hi: int) -> int:
+    return max(lo, min(hi, v))
 def create_llm(model_path: str, ctx_win: int, max_tok: int, n_threads: int, n_batch: int) -> LlamaCPP:
     """
+    Kompatibilní konstrukce napříč verzemi llama-index.
+    Někdy wrapper nepřijme n_threads/n_batch přímo => použijeme model_kwargs.
     """
+    # 1) zkus přímé parametry
     try:
         return LlamaCPP(
             model_path=model_path,
     except TypeError:
         pass
+    # 2) fallback přes model_kwargs
     try:
         return LlamaCPP(
             model_path=model_path,
             verbose=False,
         )
     except TypeError:
+        # 3) poslední fallback: jen threads
         return LlamaCPP(
             model_path=model_path,
             temperature=0.1,
 @st.cache_resource
+def get_model_path() -> str:
+    return hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
@st.cache_resource
def build_index(urls_tuple: tuple[str, ...]) -> VectorStoreIndex:
    """Load the given web pages and build a vector index from them.

    Args:
        urls_tuple: Source URLs; a tuple so the argument is hashable for
            ``st.cache_resource``.

    Returns:
        A ``VectorStoreIndex`` built from the downloaded documents.

    Side effects:
        Sets the global ``Settings.embed_model`` before the documents are
        loaded and indexed.
    """
    # Author's note: this embedding backend avoids torch/CUDA (faster install,
    # stable on Hugging Face).
    Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
    page_reader = SimpleWebPageReader(html_to_text=True)
    documents = page_reader.load_data(list(urls_tuple))
    return VectorStoreIndex.from_documents(documents)
@st.cache_resource
def load_llm_cached(ctx_win: int, max_tok: int, n_threads: int, n_batch: int) -> LlamaCPP:
    """Build the LlamaCPP model for the given settings, memoised per argument set.

    Resolves the model file through ``get_model_path()`` and forwards all
    four tuning parameters unchanged to ``create_llm``.
    """
    return create_llm(get_model_path(), ctx_win, max_tok, n_threads, n_batch)
def make_chat_engine(urls_list: list[str], ctx_win: int, max_tok: int, n_threads: int, n_batch: int) -> object:
    """Assemble a chat engine over the indexed web pages.

    Args:
        urls_list: Source URLs to index.
        ctx_win: Context window passed to the LLM; also caps the memory budget.
        max_tok: Maximum number of new tokens, forwarded to the LLM loader.
        n_threads: Thread count, forwarded to the LLM loader.
        n_batch: Batch size, forwarded to the LLM loader.

    Returns:
        The engine produced by ``index.as_chat_engine``.

    Side effects:
        Assigns the loaded model to the global ``Settings.llm``.
    """
    vector_index = build_index(tuple(urls_list))

    # Make the loaded model the active LLM for LlamaIndex.
    Settings.llm = load_llm_cached(ctx_win, max_tok, n_threads, n_batch)

    # Conversation memory is limited to 1500 tokens, or less for a smaller context window.
    chat_memory = ChatMemoryBuffer.from_defaults(token_limit=min(1500, ctx_win))

    # Author's note: "condense_plus_context" tends to be snappier / more stable
    # than the plain "context" mode.
    engine_options = {
        "chat_mode": "condense_plus_context",
        "memory": chat_memory,
        "system_prompt": SYSTEM_PROMPT,
        "verbose": False,
    }
    return vector_index.as_chat_engine(**engine_options)
with st.sidebar:
    st.header("Nastavení")

    urls_text = st.text_area(
        "URL zdroje (1 URL na řádek)",
        value="\n".join(DEFAULT_URLS),
        height=110,
    )
    # One source URL per line; blank lines are dropped.
    urls = [line for line in map(str.strip, urls_text.splitlines()) if line]

    safe_mode = st.toggle("Safe Mode (doporučeno pro HF CPU)", value=True)
    st.caption("Safe Mode brání nastavení, které na HF CPU typicky 'zamrzne'.")

    # Raw user inputs; they are limited afterwards when Safe Mode is on.
    user_max_new_tokens = st.slider("Max nových tokenů (rychlost)", 32, 256, 96, 16)
    user_context_window = st.select_slider(
        "Context window",
        options=[1024, 2048, 3072, 4096],
        value=2048,
    )

    # Fall back to 2 when the CPU count cannot be determined.
    cpu_cnt = os.cpu_count() or 2
    user_threads = st.slider("Počet vláken (threads)", 1, min(8, cpu_cnt), min(4, cpu_cnt), 1)
    user_batch = st.select_slider("Batch", options=[64, 128, 256, 512], value=128)

    if st.button("🧹 Resetovat konverzaci"):
        # Drop the stored history and engine, then rerun the script.
        for stale_key in ("messages", "chat_engine"):
            st.session_state.pop(stale_key, None)
        st.rerun()
# Hard limits (so the app does not choke on a Hugging Face CPU host).
if safe_mode:
    max_new_tokens, context_window, threads, batch = (
        _clamp(user_max_new_tokens, 32, 128),
        _clamp(user_context_window, 1024, 2048),
        _clamp(user_threads, 1, 4),
        _clamp(user_batch, 64, 256),
    )
else:
    # Safe Mode off: use the sidebar values as entered.
    max_new_tokens, context_window, threads, batch = (
        user_max_new_tokens,
        user_context_window,
        user_threads,
        user_batch,
    )

# Show the values computed above in the sidebar.
st.sidebar.markdown("---")
st.sidebar.write("Aktivní parametry:")
st.sidebar.code(
    f"max_new_tokens={max_new_tokens}\ncontext_window={context_window}\nthreads={threads}\nbatch={batch}"
)
# Engine initialisation: build it only when this session does not have one yet.
if "chat_engine" not in st.session_state:
    with st.spinner("Startuji mozek bota... (model + index)"):
        try:
            engine = make_chat_engine(urls, context_window, max_new_tokens, threads, batch)
            st.session_state.chat_engine = engine
        except Exception as init_error:
            # Report the failure in the UI and halt this script run.
            st.error(f"Chyba při inicializaci: {init_error}")
            st.stop()

# Message history: start with an empty list when none is stored yet.
if "messages" not in st.session_state:
    st.session_state.messages = []
         st.markdown(msg["content"])
def generate_answer(prompt: str):
    """Yield the assistant's answer to ``prompt``, streaming when possible.

    This is a generator. It reads the engine from
    ``st.session_state.chat_engine`` and proceeds in two stages:

    1. It iterates ``engine.stream_chat(prompt).response_gen`` and, after each
       chunk, yields ``("stream", text_so_far)`` with the cumulative text.
    2. If streaming raised an exception, or finished with only blank text, it
       calls the blocking ``engine.chat(prompt)`` and yields one
       ``("final", answer)``. ``answer`` is ``resp.response`` when that is
       truthy, otherwise ``str(resp)``.

    Every yielded text is a complete replacement for the previous one, so
    consumers only need to display the latest value. If streaming fails after
    some chunks arrived, the fallback submits the prompt a second time and its
    answer supersedes the partial text.

    Args:
        prompt: The user's message.

    Yields:
        ``(kind, text)`` tuples, where ``kind`` is ``"stream"`` or ``"final"``.

    Raises:
        Exception: Whatever ``engine.chat`` raises; streaming errors are
            logged and trigger the fallback instead of propagating.
    """
    import logging

    engine = st.session_state.chat_engine

    # 1) Streaming attempt. No time limit is applied while waiting for the
    #    first chunk: advancing the generator blocks this loop, so a timeout
    #    cannot be enforced from here.
    text = ""
    try:
        for chunk in engine.stream_chat(prompt).response_gen:
            text += chunk
            yield ("stream", text)
    except Exception:
        # Streaming may be unsupported or incompatible; record why before
        # falling back rather than hiding the error.
        logging.getLogger(__name__).warning(
            "stream_chat failed; falling back to chat()", exc_info=True
        )
    else:
        if text.strip():
            # The answer has already been delivered through the yields above.
            return

    # 2) Fallback: plain blocking chat().
    resp = engine.chat(prompt)
    answer = getattr(resp, "response", None) or str(resp)
    yield ("final", answer)
+# Chat input
 prompt = st.chat_input("Zeptej se (např: Co umíš?)...")
 if prompt:
     st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("assistant"):
         placeholder = st.empty()
+        status = st.empty()
+        full_text = ""
+        t0 = time.time()
+        try:
+            # průběžné vykreslování
+            for kind, text in generate_answer(prompt):
+                full_text = text
+                placeholder.markdown(full_text)
+                status.caption(f"Generuji... {time.time() - t0:.1f}s")
+            status.caption(f"Hotovo za {time.time() - t0:.1f}s")
+        except Exception as e:
+            full_text = f"Chyba při generování odpovědi: {e}"
+            placeholder.markdown(full_text)
+    st.session_state.messages.append({"role": "assistant", "content": full_text})