Spaces:
Runtime error
Runtime error
| import abc | |
| import io | |
| import logging | |
| import re | |
| from typing import Optional | |
| import trafilatura | |
| from httpx import Client | |
| from PIL import Image | |
| from gistillery.base import JobInput | |
| from gistillery.tools import get_agent | |
# Pattern for http(s) URLs: the scheme followed by a run of non-whitespace
# characters. The single capture group spans the whole URL, so ``findall``
# yields plain strings.
RE_URL = re.compile(r"(https?://[^\s]+)")

# Module-level logger named after this module; its level is set to DEBUG.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def get_url(text: str) -> str | None:
    """Return the URL found in ``text`` when there is exactly one.

    Args:
        text: Text to scan with ``RE_URL``.

    Returns:
        The single matched URL, or ``None`` when ``text`` contains no URL
        or more than one.
    """
    matches: list[str] = RE_URL.findall(text)
    # Zero hits and multiple hits are both treated as "no URL".
    return matches[0] if len(matches) == 1 else None
class Processor(abc.ABC):
    """Base class for job processors.

    Calling an instance logs the start and the end of the run and delegates
    the actual work to :meth:`process`. Subclasses override :meth:`match`
    and :meth:`process`; the implementations here raise
    ``NotImplementedError``.
    """

    def get_name(self) -> str:
        """Return the class name of the concrete processor."""
        return self.__class__.__name__

    def __call__(self, job: JobInput) -> str:
        """Process ``job`` and log before and after.

        Args:
            job: The job to process; its ``id`` is sliced to eight
                characters for the log messages.

        Returns:
            Whatever :meth:`process` returns for ``job``.
        """
        short_id = job.id[:8]
        # Interpolate the job itself. The bare name ``input`` is not bound
        # in this method, so it would resolve to the builtin function and
        # the log line would never show the job.
        logger.info(
            f"Processing {job} with {self.__class__.__name__} (id={short_id})"
        )
        result = self.process(job)
        logger.info(f"Finished processing input (id={short_id})")
        return result

    def process(self, input: JobInput) -> str:
        """Turn ``input`` into text; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    def match(self, input: JobInput) -> bool:
        """Tell whether this processor handles ``input``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
class RawTextProcessor(Processor):
    """Processor that accepts every input and returns its stripped content."""

    def match(self, input: JobInput) -> bool:
        """Accept any input unconditionally."""
        return True

    def process(self, input: JobInput) -> str:
        """Return ``input.content`` with surrounding whitespace removed."""
        raw_text = input.content
        return raw_text.strip()
class DefaultUrlProcessor(Processor):
    """Processor for inputs whose content contains exactly one URL.

    :meth:`match` remembers the URL on the instance and :meth:`process`
    fetches it, so ``match`` has to succeed before ``process`` is called.
    """

    def __init__(self) -> None:
        """Create the HTTP client and the output template."""
        self.client = Client()
        # Filled in by match(); stays None until a URL has been matched.
        # (Previously the typing construct ``Optional[str]`` itself was
        # stored here as the value instead of being used as an annotation.)
        self.url: Optional[str] = None
        self.template = "{url}\n\n{content}"

    def match(self, input: JobInput) -> bool:
        """Return True and remember the URL if the content holds exactly one.

        Args:
            input: Job whose stripped ``content`` is searched with
                ``get_url``.

        Returns:
            ``True`` when ``get_url`` found a URL (which is then stored on
            ``self.url``), ``False`` otherwise.
        """
        url = get_url(input.content.strip())
        if url is None:
            return False
        self.url = url
        return True

    def process(self, input: JobInput) -> str:
        """Fetch the matched URL and return its extracted text.

        The result is the URL, a blank line, and the text that
        ``trafilatura.extract`` pulled out of the response body.

        Raises:
            TypeError: If no URL has been stored by :meth:`match` yet.
        """
        if not isinstance(self.url, str):
            raise TypeError("self.url must be a string")

        # NOTE(review): the response status is not checked, so the body of
        # an error response is processed like any other page - confirm
        # whether that is intended.
        html = self.client.get(self.url).text

        # extract() returns None when it cannot find any content; fall back
        # to an empty string so the literal text "None" is never emitted.
        extracted = trafilatura.extract(html) or ""
        return self.template.format(url=self.url, content=extracted)
class ImageUrlProcessor(Processor):
    """Processor for inputs whose single URL ends in an image file suffix.

    :meth:`match` remembers the URL on the instance and :meth:`process`
    downloads it, so ``match`` has to succeed before ``process`` is called.
    """

    def __init__(self) -> None:
        """Create the HTTP client and the set of accepted image suffixes."""
        self.client = Client()
        # Filled in by match(); stays None until a URL has been matched.
        # (Previously the typing construct ``Optional[str]`` itself was
        # stored here as the value instead of being used as an annotation.)
        self.url: Optional[str] = None
        self.template = "{url}\n\n{content}"
        self.image_suffixes = {'jpg', 'jpeg', 'png', 'gif'}

    def match(self, input: JobInput) -> bool:
        """Return True and remember the URL if it looks like an image link.

        The suffix is whatever follows the last ``.`` in the URL, compared
        case-insensitively. A URL with a query string or fragment after the
        file extension therefore does not match.

        Args:
            input: Job whose stripped ``content`` is searched with
                ``get_url``.

        Returns:
            ``True`` when exactly one URL was found and its suffix is in
            ``self.image_suffixes`` (the URL is then stored on
            ``self.url``), ``False`` otherwise.
        """
        url = get_url(input.content.strip())
        if url is None:
            return False

        suffix = url.rsplit(".", 1)[-1].lower()
        if suffix not in self.image_suffixes:
            return False

        self.url = url
        return True

    def process(self, input: JobInput) -> str:
        """Download the matched image and return a caption for it.

        The response bytes are opened with PIL, converted to RGB, and
        passed to the agent returned by ``get_agent()`` together with a
        captioning prompt.

        Raises:
            TypeError: If no URL has been stored by :meth:`match` yet.
        """
        if not isinstance(self.url, str):
            raise TypeError("self.url must be a string")

        response = self.client.get(self.url)
        image = Image.open(io.BytesIO(response.content)).convert('RGB')
        caption = get_agent().run("Caption the following image", image=image)
        return str(caption)