Spaces:

BenjaminB
/

gistillery

Runtime error

App Files Files Community

Benjamin Bossan committed on May 23, 2023

Commit

4c2b75c

1 Parent(s): 111044d

Add pdf processor using pypdf

Browse files

Also, enforce a max length on the processed output.

Files changed (7) hide show

demo.py +3 -0
requests.org +11 -0
requirements.txt +1 -0
src/gistillery/config.py +12 -2
src/gistillery/errors.py +2 -0
src/gistillery/preprocessing.py +61 -1
src/gistillery/registry.py +2 -0

demo.py CHANGED Viewed

@@ -72,6 +72,9 @@ Input currently supports:
 - a URL to a webpage
 - a URL to a youtube video (the video will be transcribed)
 - a URL to an image (url ending in .png, .jpg, etc.; the image description will be used)
 """

 - a URL to a webpage
 - a URL to a youtube video (the video will be transcribed)
 - a URL to an image (url ending in .png, .jpg, etc.; the image description will be used)
+- a URL to a PDF (url ending in .pdf, e.g. https://arxiv.org/pdf/2108.12409.pdf)
+Long inputs will be truncated.
 """

requests.org CHANGED Viewed

@@ -62,6 +62,17 @@ curl -X 'POST' \
 }'
 #+end_src
 #+begin_src bash
 curl -X 'GET' \
   'http://localhost:8080/check_job_status/' \

 }'
 #+end_src
+#+begin_src bash
+curl -X 'POST' \
+  'http://localhost:8080/submit/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "author": "ben",
+  "content": "https://arxiv.org/pdf/2108.12409.pdf"
+}'
+#+end_src
 #+begin_src bash
 curl -X 'GET' \
   'http://localhost:8080/check_job_status/' \

requirements.txt CHANGED Viewed

@@ -10,3 +10,4 @@ pillow
 gradio
 urllib3
 pytube

 gradio
 urllib3
 pytube
+pypdf

src/gistillery/config.py CHANGED Viewed

@@ -2,14 +2,24 @@ import os
 from pathlib import Path
 from pydantic import BaseSettings
 class Config(BaseSettings):
     hf_hub_token: str = "missing"
     hf_agent: str = "https://api-inference.huggingface.co/models/bigcode/starcoder"
     db_file_name: Path = Path("sqlite-data.db")
-    sampling_rate: int = 16_000  # audio transcription
-    max_yt_length: int = 1800  # in minutes
     class Config:
         # load .env file by default, with proviso to use other .env files if set

 from pathlib import Path
 from pydantic import BaseSettings
+from pydantic.types import PositiveInt
 class Config(BaseSettings):
     hf_hub_token: str = "missing"
     hf_agent: str = "https://api-inference.huggingface.co/models/bigcode/starcoder"
     db_file_name: Path = Path("sqlite-data.db")
+    processing_max_length: PositiveInt = 10000  # in characters
+    sampling_rate: PositiveInt = 16_000  # audio transcription
+    max_yt_length: PositiveInt = 1800  # in minutes
+    pdf_stop_words: list[str] = [
+        "ACKNOWLEDGMENTS",
+        "Acknowledgments",
+        "acknowledgments",
+        "REFERENCES",
+        "References",
+        "references",
+    ]
     class Config:
         # load .env file by default, with proviso to use other .env files if set

src/gistillery/errors.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ class ProcessingError(RuntimeError):
2	+ """Error raised when processing goes wrong"""

src/gistillery/preprocessing.py CHANGED Viewed

@@ -13,6 +13,7 @@ from transformers import AutoProcessor, WhisperForConditionalGeneration
 from gistillery.base import JobInput
 from gistillery.config import get_config
 from gistillery.media import download_yt_audio, load_audio
 from gistillery.tools import get_agent
@@ -32,13 +33,29 @@ def get_url(text: str) -> str | None:
 class Processor(abc.ABC):
     def get_name(self) -> str:
         return self.__class__.__name__
     def __call__(self, job: JobInput) -> str:
         _id = job.id
         logger.info(f"Processing {input} with {self.__class__.__name__} (id={_id[:8]})")
         result = self.process(job)
         logger.info(f"Finished processing input (id={_id[:8]})")
         return result
@@ -61,6 +78,7 @@ class RawTextProcessor(Processor):
 class DefaultUrlProcessor(Processor):
     def __init__(self) -> None:
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"
@@ -85,8 +103,48 @@ class DefaultUrlProcessor(Processor):
         return str(text)
 class ImageUrlProcessor(Processor):
     def __init__(self) -> None:
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"
@@ -111,13 +169,15 @@ class ImageUrlProcessor(Processor):
         response = self.client.get(self.url)
         image = Image.open(io.BytesIO(response.content)).convert('RGB')
         caption = get_agent().run("Caption the following image", image=image)
-        return str(caption)
 class YoutubeUrlProcessor(Processor):
     """Download yt audio, transcribe with whisper"""
     def __init__(self) -> None:
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"

 from gistillery.base import JobInput
 from gistillery.config import get_config
+from gistillery.errors import ProcessingError
 from gistillery.media import download_yt_audio, load_audio
 from gistillery.tools import get_agent
 class Processor(abc.ABC):
+    def __init__(self) -> None:
+        self.max_length = get_config().processing_max_length
+        self._super_init_called = True
     def get_name(self) -> str:
         return self.__class__.__name__
     def __call__(self, job: JobInput) -> str:
+        if not self._super_init_called:
+            raise RuntimeError(
+                "super().__init__() was not called with class "
+                f"{self.__class__.__name__}"
+            )
         _id = job.id
         logger.info(f"Processing {input} with {self.__class__.__name__} (id={_id[:8]})")
         result = self.process(job)
+        if len(result) > self.max_length:
+            logger.warning(
+                f"Length of result ({len(result)}) exceeds max_length "
+                f"({self.max_length}), truncating"
+            )
+            result = result[: self.max_length]
         logger.info(f"Finished processing input (id={_id[:8]})")
         return result
 class DefaultUrlProcessor(Processor):
     def __init__(self) -> None:
+        super().__init__()
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"
         return str(text)
+class PdfUrlProcessor(Processor):
+    def __init__(self) -> None:
+        super().__init__()
+        self.client = Client()
+        self.url = Optional[str]
+        self.template = "{url}\n\n{content}"
+        self.stop_words = get_config().pdf_stop_words
+    def match(self, input: JobInput) -> bool:
+        url = get_url(input.content.strip())
+        if url is None:
+            return False
+        suffix = url.rsplit(".", 1)[-1].lower()
+        if suffix != "pdf":
+            return False
+        self.url = url
+        return True
+    def process(self, input: JobInput) -> str:
+        if not isinstance(self.url, str):
+            raise TypeError("self.url must be a string")
+        response = self.client.get(self.url)
+        import pypdf
+        pdf = pypdf.PdfReader(io.BytesIO(response.content))
+        results = []
+        for page in pdf.pages:
+            results.append(page.extract_text())
+            if any(word in results[-1] for word in self.stop_words):
+                break
+        text = "\n".join(results).strip()
+        if not text:
+            raise ProcessingError("No text could be extracted from PDF")
+        return self.template.format(url=self.url, content=text)
 class ImageUrlProcessor(Processor):
     def __init__(self) -> None:
+        super().__init__()
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"
         response = self.client.get(self.url)
         image = Image.open(io.BytesIO(response.content)).convert('RGB')
         caption = get_agent().run("Caption the following image", image=image)
+        text = str(caption)
+        return self.template.format(url=self.url, content=text)
 class YoutubeUrlProcessor(Processor):
     """Download yt audio, transcribe with whisper"""
     def __init__(self) -> None:
+        super().__init__()
         self.client = Client()
         self.url = Optional[str]
         self.template = "{url}\n\n{content}"

src/gistillery/registry.py CHANGED Viewed

@@ -2,6 +2,7 @@ from gistillery.base import JobInput
 from gistillery.preprocessing import (
     DefaultUrlProcessor,
     ImageUrlProcessor,
     Processor,
     RawTextProcessor,
     YoutubeUrlProcessor,
@@ -59,6 +60,7 @@ def get_tool_registry() -> ToolRegistry:
     _registry = ToolRegistry()
     _registry.register_processor(YoutubeUrlProcessor())
     _registry.register_processor(ImageUrlProcessor())
     _registry.register_processor(DefaultUrlProcessor())
     _registry.register_processor(RawTextProcessor())

 from gistillery.preprocessing import (
     DefaultUrlProcessor,
     ImageUrlProcessor,
+    PdfUrlProcessor,
     Processor,
     RawTextProcessor,
     YoutubeUrlProcessor,
     _registry = ToolRegistry()
     _registry.register_processor(YoutubeUrlProcessor())
+    _registry.register_processor(PdfUrlProcessor())
     _registry.register_processor(ImageUrlProcessor())
     _registry.register_processor(DefaultUrlProcessor())
     _registry.register_processor(RawTextProcessor())