Spaces:

BenjaminB
/

gistillery

Runtime error

App Files Community

Benjamin Bossan committed on May 8, 2023

Commit

126a4c6

1 Parent(s): a240da9

Refactor ml model handling

Browse files

Files changed (4) hide show

src/db.py +4 -1
src/ml.py +116 -67
src/webservice.py +6 -0
src/worker.py +79 -38

src/db.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
 import sqlite3
 from contextlib import contextmanager
 from typing import Generator
@@ -6,6 +7,8 @@ from typing import Generator
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 schema_entries = """
 CREATE TABLE entries
@@ -67,7 +70,7 @@ def _get_db_connection() -> sqlite3.Connection:
     global TABLES_CREATED
     # sqlite cannot deal with concurrent access, so we set a big timeout
-    conn = sqlite3.connect("sqlite-data.db", timeout=30)
     if TABLES_CREATED:
         return conn

 import logging
+import os
 import sqlite3
 from contextlib import contextmanager
 from typing import Generator
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
+db_file = os.getenv("DB_FILE_NAME", "sqlite-data.db")
 schema_entries = """
 CREATE TABLE entries
     global TABLES_CREATED
     # sqlite cannot deal with concurrent access, so we set a big timeout
+    conn = sqlite3.connect(db_file, timeout=30)
     if TABLES_CREATED:
         return conn

src/ml.py CHANGED Viewed

@@ -1,52 +1,126 @@
 import abc
 import logging
 import re
 import httpx
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
 from base import JobInput
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
-MODEL_NAME = "google/flan-t5-large"
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-class Summarizer:
     def __init__(self) -> None:
         self.template = "Summarize the text below in two sentences:\n\n{}"
-        self.generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
-        self.generation_config.max_new_tokens = 200
-        self.generation_config.min_new_tokens = 100
-        self.generation_config.top_k = 5
-        self.generation_config.repetition_penalty = 1.5
     def __call__(self, x: str) -> str:
         text = self.template.format(x)
-        inputs = tokenizer(text, return_tensors="pt")
-        outputs = model.generate(**inputs, generation_config=self.generation_config)
-        output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
         assert isinstance(output, str)
         return output
     def get_name(self) -> str:
-        return f"Summarizer({MODEL_NAME})"
-class Tagger:
-    def __init__(self) -> None:
         self.template = (
             "Create a list of tags for the text below. The tags should be high level "
             "and specific. Prefix each tag with a hashtag.\n\n{}\n\nTags: #general"
         )
-        self.generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
-        self.generation_config.max_new_tokens = 50
-        self.generation_config.min_new_tokens = 25
-        # increase the temperature to make the model more creative
-        self.generation_config.temperature = 1.5
     def _extract_tags(self, text: str) -> list[str]:
         tags = set()
@@ -57,46 +131,25 @@ class Tagger:
     def __call__(self, x: str) -> list[str]:
         text = self.template.format(x)
-        inputs = tokenizer(text, return_tensors="pt")
-        outputs = model.generate(**inputs, generation_config=self.generation_config)
-        output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
         tags = self._extract_tags(output)
         return tags
     def get_name(self) -> str:
-        return f"Tagger({MODEL_NAME})"
-class Processor(abc.ABC):
-    def __call__(self, job: JobInput) -> str:
-        _id = job.id
-        logger.info(f"Processing {input} with {self.__class__.__name__} (id={_id[:8]})")
-        result = self.process(job)
-        logger.info(f"Finished processing input (id={_id[:8]})")
-        return result
-    def process(self, input: JobInput) -> str:
-        raise NotImplementedError
-    def match(self, input: JobInput) -> bool:
-        raise NotImplementedError
-    def get_name(self) -> str:
-        raise NotImplementedError
-class RawProcessor(Processor):
     def match(self, input: JobInput) -> bool:
         return True
     def process(self, input: JobInput) -> str:
         return input.content
-    def get_name(self) -> str:
-        return self.__class__.__name__
-class PlainUrlProcessor(Processor):
     def __init__(self) -> None:
         self.client = httpx.Client()
         self.regex = re.compile(r"(https?://[^\s]+)")
@@ -118,26 +171,22 @@ class PlainUrlProcessor(Processor):
         text = self.template.format(url=self.url, content=text)
         return text
-    def get_name(self) -> str:
-        return self.__class__.__name__
-class ProcessorRegistry:
-    def __init__(self) -> None:
-        self.registry: list[Processor] = []
-        self.default_registry: list[Processor] = []
-        self.set_default_processors()
-    def set_default_processors(self) -> None:
-        self.default_registry.extend([PlainUrlProcessor(), RawProcessor()])
-    def register(self, processor: Processor) -> None:
-        self.registry.append(processor)
-    def dispatch(self, input: JobInput) -> Processor:
-        for processor in self.registry + self.default_registry:
-            if processor.match(input):
-                return processor
-        # should never be required, but eh
-        return RawProcessor()

 import abc
+from typing import Any
 import logging
 import re
 import httpx
 from base import JobInput
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
+class Processor(abc.ABC):
+    def get_name(self) -> str:
+        return self.__class__.__name__
+    def __call__(self, job: JobInput) -> str:
+        _id = job.id
+        logger.info(f"Processing {input} with {self.__class__.__name__} (id={_id[:8]})")
+        result = self.process(job)
+        logger.info(f"Finished processing input (id={_id[:8]})")
+        return result
+    @abc.abstractmethod
+    def process(self, input: JobInput) -> str:
+        raise NotImplementedError
+    @abc.abstractmethod
+    def match(self, input: JobInput) -> bool:
+        raise NotImplementedError
+class Summarizer(abc.ABC):
+    def __init__(self, model_name: str, model: Any, tokenizer: Any, generation_config: Any) -> None:
+        raise NotImplementedError
+    def get_name(self) -> str:
+        raise NotImplementedError
+    @abc.abstractmethod
+    def __call__(self, x: str) -> str:
+        raise NotImplementedError
+class Tagger(abc.ABC):
+    def __init__(self, model_name: str, model: Any, tokenizer: Any, generation_config: Any) -> None:
+        raise NotImplementedError
+    def get_name(self) -> str:
+        raise NotImplementedError
+    @abc.abstractmethod
+    def __call__(self, x: str) -> list[str]:
+        raise NotImplementedError
+class MlRegistry:
     def __init__(self) -> None:
+        self.processors: list[Processor] = []
+        self.summerizer: Summarizer | None = None
+        self.tagger: Tagger | None = None
+        self.model = None
+        self.tokenizer = None
+    def register_processor(self, processor: Processor) -> None:
+        self.processors.append(processor)
+    def register_summarizer(self, summarizer: Summarizer) -> None:
+        self.summerizer = summarizer
+    def register_tagger(self, tagger: Tagger) -> None:
+        self.tagger = tagger
+    def get_processor(self, input: JobInput) -> Processor:
+        assert self.processors
+        for processor in self.processors:
+            if processor.match(input):
+                return processor
+        return RawTextProcessor()
+    def get_summarizer(self) -> Summarizer:
+        assert self.summerizer
+        return self.summerizer
+    def get_tagger(self) -> Tagger:
+        assert self.tagger
+        return self.tagger
+class HfTransformersSummarizer(Summarizer):
+    def __init__(self, model_name: str, model: Any, tokenizer: Any, generation_config: Any) -> None:
+        self.model_name = model_name
+        self.model = model
+        self.tokenizer = tokenizer
+        self.generation_config = generation_config
         self.template = "Summarize the text below in two sentences:\n\n{}"
     def __call__(self, x: str) -> str:
         text = self.template.format(x)
+        inputs = self.tokenizer(text, return_tensors="pt")
+        outputs = self.model.generate(**inputs, generation_config=self.generation_config)
+        output = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
         assert isinstance(output, str)
         return output
     def get_name(self) -> str:
+        return f"{self.__class__.__name__}({self.model_name})"
+class HfTransformersTagger(Tagger):
+    def __init__(self, model_name: str, model: Any, tokenizer: Any, generation_config: Any) -> None:
+        self.model_name = model_name
+        self.model = model
+        self.tokenizer = tokenizer
+        self.generation_config = generation_config
         self.template = (
             "Create a list of tags for the text below. The tags should be high level "
             "and specific. Prefix each tag with a hashtag.\n\n{}\n\nTags: #general"
         )
     def _extract_tags(self, text: str) -> list[str]:
         tags = set()
     def __call__(self, x: str) -> list[str]:
         text = self.template.format(x)
+        inputs = self.tokenizer(text, return_tensors="pt")
+        outputs = self.model.generate(**inputs, generation_config=self.generation_config)
+        output = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
         tags = self._extract_tags(output)
         return tags
     def get_name(self) -> str:
+        return f"{self.__class__.__name__}({self.model_name})"
+class RawTextProcessor(Processor):
     def match(self, input: JobInput) -> bool:
         return True
     def process(self, input: JobInput) -> str:
         return input.content
+class DefaultUrlProcessor(Processor):
     def __init__(self) -> None:
         self.client = httpx.Client()
         self.regex = re.compile(r"(https?://[^\s]+)")
         text = self.template.format(url=self.url, content=text)
         return text
+# class ProcessorRegistry:
+#     def __init__(self) -> None:
+#         self.registry: list[Processor] = []
+#         self.default_registry: list[Processor] = []
+#         self.set_default_processors()
+#     def set_default_processors(self) -> None:
+#         self.default_registry.extend([PlainUrlProcessor(), RawProcessor()])
+#     def register(self, processor: Processor) -> None:
+#         self.registry.append(processor)
+#     def dispatch(self, input: JobInput) -> Processor:
+#         for processor in self.registry + self.default_registry:
+#             if processor.match(input):
+#                 return processor
+#         # should never be required, but eh
+#         return RawProcessor()

src/webservice.py CHANGED Viewed

@@ -14,6 +14,12 @@ logger.setLevel(logging.DEBUG)
 app = FastAPI()
 @app.post("/submit/")
 def submit_job(input: RequestInput) -> str:
     # submit a new job, poor man's job queue

 app = FastAPI()
+# status endpoint: always responds with "OK"
+@app.get("/status/")
+def status() -> str:
+    return "OK"
 @app.post("/submit/")
 def submit_job(input: RequestInput) -> str:
     # submit a new job, poor man's job queue

src/worker.py CHANGED Viewed

@@ -1,18 +1,19 @@
 import time
 from base import JobInput
 from db import get_db_cursor
-from ml import ProcessorRegistry, Summarizer, Tagger
 SLEEP_INTERVAL = 5
-processor_registry = ProcessorRegistry()
-summarizer = Summarizer()
-tagger = Tagger()
-print("loaded ML models")
 def check_pending_jobs() -> list[JobInput]:
     """Check DB for pending jobs"""
     with get_db_cursor() as cursor:
@@ -30,15 +31,38 @@ def check_pending_jobs() -> list[JobInput]:
     ]
-def store(
-    job: JobInput,
-    *,
-    summary: str,
-    tags: list[str],
-    processor_name: str,
-    summarizer_name: str,
-    tagger_name: str,
-) -> None:
     with get_db_cursor() as cursor:
         # write to entries, summary, tags tables
         cursor.execute(
@@ -46,39 +70,23 @@ def store(
                 "INSERT INTO summaries (entry_id, summary, summarizer_name)"
                 " VALUES (?, ?, ?)"
             ),
-            (job.id, summary, summarizer_name),
         )
         cursor.executemany(
             "INSERT INTO tags (entry_id, tag, tagger_name) VALUES (?, ?, ?)",
-            [(job.id, tag, tagger_name) for tag in tags],
         )
-def process_job(job: JobInput) -> None:
     tic = time.perf_counter()
     print(f"Processing job for (id={job.id[:8]})")
     # care: acquire the cursor (which leads to locking) as late as possible, since
     # the processing takes time and we don't want to block other workers during that time
     try:
-        processor = processor_registry.dispatch(job)
-        processor_name = processor.get_name()
-        processed = processor(job)
-        tagger_name = tagger.get_name()
-        tags = tagger(processed)
-        summarizer_name = summarizer.get_name()
-        summary = summarizer(processed)
-        store(
-            job,
-            summary=summary,
-            tags=tags,
-            processor_name=processor_name,
-            summarizer_name=summarizer_name,
-            tagger_name=tagger_name,
-        )
         # update job status to done
         with get_db_cursor() as cursor:
             cursor.execute(
@@ -96,7 +104,40 @@ def process_job(job: JobInput) -> None:
     print(f"Finished processing job (id={job.id[:8]}) in {toc - tic:0.3f} seconds")
 def main() -> None:
     while True:
         jobs = check_pending_jobs()
         if not jobs:
@@ -106,7 +147,7 @@ def main() -> None:
         print(f"Found {len(jobs)} pending job(s), processing...")
         for job in jobs:
-            process_job(job)
 if __name__ == "__main__":

 import time
+from dataclasses import dataclass
 from base import JobInput
 from db import get_db_cursor
+from ml import (
+    DefaultUrlProcessor,
+    HfTransformersSummarizer,
+    HfTransformersTagger,
+    MlRegistry,
+    RawTextProcessor,
+)
 SLEEP_INTERVAL = 5
 def check_pending_jobs() -> list[JobInput]:
     """Check DB for pending jobs"""
     with get_db_cursor() as cursor:
     ]
+@dataclass
+class JobOutput:
+    summary: str
+    tags: list[str]
+    processor_name: str
+    summarizer_name: str
+    tagger_name: str
+def _process_job(job: JobInput, registry: MlRegistry) -> JobOutput:
+    processor = registry.get_processor(job)
+    processor_name = processor.get_name()
+    processed = processor(job)
+    tagger = registry.get_tagger()
+    tagger_name = tagger.get_name()
+    tags = tagger(processed)
+    summarizer = registry.get_summarizer()
+    summarizer_name = summarizer.get_name()
+    summary = summarizer(processed)
+    return JobOutput(
+        summary=summary,
+        tags=tags,
+        processor_name=processor_name,
+        summarizer_name=summarizer_name,
+        tagger_name=tagger_name,
+    )
+def store(job: JobInput, output: JobOutput) -> None:
     with get_db_cursor() as cursor:
         # write to entries, summary, tags tables
         cursor.execute(
                 "INSERT INTO summaries (entry_id, summary, summarizer_name)"
                 " VALUES (?, ?, ?)"
             ),
+            (job.id, output.summary, output.summarizer_name),
         )
         cursor.executemany(
             "INSERT INTO tags (entry_id, tag, tagger_name) VALUES (?, ?, ?)",
+            [(job.id, tag, output.tagger_name) for tag in output.tags],
         )
+def process_job(job: JobInput, registry: MlRegistry) -> None:
     tic = time.perf_counter()
     print(f"Processing job for (id={job.id[:8]})")
     # care: acquire the cursor (which leads to locking) as late as possible, since
     # the processing takes time and we don't want to block other workers during that time
     try:
+        output = _process_job(job, registry)
+        store(job, output)
         # update job status to done
         with get_db_cursor() as cursor:
             cursor.execute(
     print(f"Finished processing job (id={job.id[:8]}) in {toc - tic:0.3f} seconds")
+def load_mlregistry(model_name: str) -> MlRegistry:
+    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    config_summarizer = GenerationConfig.from_pretrained(model_name)
+    config_summarizer.max_new_tokens = 200
+    config_summarizer.min_new_tokens = 100
+    config_summarizer.top_k = 5
+    config_summarizer.repetition_penalty = 1.5
+    config_tagger = GenerationConfig.from_pretrained(model_name)
+    config_tagger.max_new_tokens = 50
+    config_tagger.min_new_tokens = 25
+    # increase the temperature to make the model more creative
+    config_tagger.temperature = 1.5
+    summarizer = HfTransformersSummarizer(model_name, model, tokenizer, config_summarizer)
+    tagger = HfTransformersTagger(model_name, model, tokenizer, config_tagger)
+    registry = MlRegistry()
+    registry.register_processor(DefaultUrlProcessor())
+    registry.register_processor(RawTextProcessor())
+    registry.register_summarizer(summarizer)
+    registry.register_tagger(tagger)
+    return registry
 def main() -> None:
+    model_name = "google/flan-t5-large"
+    registry = load_mlregistry(model_name)
     while True:
         jobs = check_pending_jobs()
         if not jobs:
         print(f"Found {len(jobs)} pending job(s), processing...")
         for job in jobs:
+            process_job(job, registry)
 if __name__ == "__main__":