ffreemt committed · Commit 0acdcc9 · 1 Parent(s): cb39675

Update
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: llama2-7b-chat-uncensored-ggml
+title: langchain-llama2-7b-chat-uncensored-ggml
 emoji: 🚀
 colorFrom: green
 colorTo: green
app.py CHANGED

@@ -1,21 +1,34 @@
 """Run codes."""
 # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
 # ruff: noqa: E501
+import gc
 import os
 import platform
 import random
 import time
-from dataclasses import asdict, dataclass
 from pathlib import Path
+from queue import deque
+from typing import Any, Dict, List, Union
 
 # from types import SimpleNamespace
 import gradio as gr
 import psutil
 from about_time import about_time
-from ctransformers import AutoModelForCausalLM
+from ctransformers import Config
 from dl_hf_model import dl_hf_model
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+# from ctransformers import AutoModelForCausalLM
+from langchain.llms import CTransformers
+from langchain.schema import LLMResult
 from loguru import logger
 
+deq = deque()
+sig_end = object()  # signals the processing is done
+
+# from langchain.llms import OpenAI
+
 filename_list = [
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
     "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
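The new module-level `deq` / `sig_end` pair sets up a producer/consumer channel: a callback handler (added later in this diff) appends tokens to the deque while the model generates, and the unique `object()` sentinel marks end-of-stream, since it can never compare equal to any token string. A minimal, self-contained sketch of the pattern, with a hypothetical `produce` standing in for the LLM callback:

import time
from collections import deque
from threading import Thread

deq = deque()
sig_end = object()  # unique sentinel; never equal to a token string

def produce():
    """Hypothetical stand-in for the streaming LLM callback."""
    for token in ["Hello", ",", " world"]:
        deq.append(token)
    deq.append(sig_end)

Thread(target=produce).start()
response = ""
while True:
    if deq:
        item = deq.popleft()
        if item is sig_end:
            break
        response += item
    else:
        time.sleep(0.01)  # avoid busy-waiting while the producer works
print(response)  # prints: Hello, world

(The diff imports `deque` from `queue`; that works because CPython's `queue` module re-exports `collections.deque`, but `collections` is its documented home.)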
@@ -121,6 +134,7 @@ cpu_count: int = int(_) if _ else 1
 logger.debug(f"{cpu_count=}")
 
 LLM = None
+gc.collect()
 
 try:
     model_loc, file_size = dl_hf_model(url)
@@ -128,10 +142,15 @@ except Exception as exc_:
     logger.error(exc_)
     raise SystemExit(1) from exc_
 
-LLM = AutoModelForCausalLM.from_pretrained(
-    model_loc,
+config = Config()
+config.stream = True
+
+# LLM = AutoModelForCausalLM.from_pretrained(
+LLM = CTransformers(
+    model=model_loc,
     model_type="llama",
-    ...
+    threads=cpu_count,
+    callbacks=[StreamingStdOutCallbackHandler()],
 )
 
 logger.info(f"done load llm {model_loc=} {file_size=}G")
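The load now goes through langchain's `CTransformers` wrapper instead of ctransformers' `AutoModelForCausalLM` (kept as a comment). Note that the `config` object built just above is not handed to the constructor in this hunk; langchain's wrapper takes generation settings through a `config` mapping, so as written it is the callback that does the streaming work. A minimal usage sketch, assuming the `LLM` object from this diff (the prompt is illustrative):

# calling the wrapper blocks until generation finishes; with
# StreamingStdOutCallbackHandler attached, tokens echo to stdout as they arrive
text = LLM("Q: What does GGML quantization trade off? A:")
print(text)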
@@ -143,45 +162,33 @@ except Exception:
     # Windows
     logger.warning("Windows, cant run time.tzset()")
 
-_ = """
-ns = SimpleNamespace(
-    response="",
-    generator=(_ for _ in []),
-)
-# """
 
-...
-    llm=LLM,
-    config: GenerationConfig = GenerationConfig(),
-):
-    """Run model inference, will return a Generator if streaming is true."""
-    # _ = prompt_template.format(question=question)
-    # print(_)
-
-    prompt = prompt_template.format(question=question)
-
-    return llm(
-        prompt,
-        **asdict(config),
-    )
+class DequeCallbackHandler(BaseCallbackHandler):
+    """Mediate gradio and stream output."""
+
+    def __init__(self, deq: deque):
+        """Init deque for FIFO, may need to upgrade to queue.Queue or queue.SimpleQueue."""
+        self.q = deq
+
+    def on_llm_start(
+        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+    ) -> None:
+        """Run when LLM starts running. Clean the queue."""
+        self.q.clear()
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.q.append(token)
 
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.q.append(sig_end)
 
+    def on_llm_error(
+        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+    ) -> None:
+        """Run when LLM errors."""
+        self.q.append(sig_end)
 
 
 def user(user_message, history):
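`DequeCallbackHandler` is the producer half of the stream: it pushes tokens into `deq` and terminates with `sig_end` on completion or error. The consumer half is not part of this diff; a sketch of how a gradio handler could drain the deque while the blocking LLM call runs in a worker thread (the function name, threading approach, and `[user, bot]` history layout are assumptions):

import time
from threading import Thread

def bot_stream(history):
    """Hypothetical gradio handler that yields the growing response."""
    prompt = history[-1][0]
    handler = DequeCallbackHandler(deq)
    # run the blocking LLM call in a worker so tokens can be yielded here;
    # on_llm_start clears the deque, so no stale tokens leak between runs
    worker = Thread(target=LLM, args=(prompt,), kwargs={"callbacks": [handler]})
    worker.start()
    history[-1][1] = ""
    while True:
        if deq:
            token = deq.popleft()
            if token is sig_end:
                break
            history[-1][1] += token
            yield history
        else:
            time.sleep(0.01)  # wait for the producer thread
    worker.join()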
@@ -223,7 +230,7 @@ def bot(history):
 
     logger.debug("about to generate")
 
-    config = ...
+    config = Config(reset=True)
     for elm in generate(user_message, config=config):
         if flag == 1:
             logger.debug("in the loop")
@@ -251,7 +258,7 @@ def predict_api(prompt):
     logger.debug(f"{prompt=}")
     try:
         # user_prompt = prompt
-        config = ...
+        config = Config(
             temperature=0.2,
             top_k=10,
             top_p=0.9,
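`predict_api` builds its own `Config`. ctransformers' `Config` is a plain dataclass of generation knobs, so keyword arguments map directly onto fields. A sketch using the values from this hunk; the `stream` and `threads` lines are assumptions for completeness, not part of this change:

from ctransformers import Config

config = Config(
    temperature=0.2,    # lower temperature, less random sampling
    top_k=10,           # sample only from the 10 most likely tokens
    top_p=0.9,          # nucleus-sampling cumulative-probability cutoff
    stream=True,        # emit tokens incrementally via callbacks
    threads=cpu_count,  # assumes the cpu_count computed earlier in app.py
)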
@@ -467,4 +474,4 @@ else:
     concurrency_count = 1
 logger.info(f"{concurrency_count=}")
 
-block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
+# block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)