Initial commit
Browse files- README.md +4 -4
- vianu/__init__.py +71 -0
- vianu/spock/Dockerfile +24 -0
- vianu/spock/__init__.py +0 -0
- vianu/spock/__main__.py +158 -0
- vianu/spock/app/__init__.py +3 -0
- vianu/spock/app/app.py +592 -0
- vianu/spock/app/formatter.py +98 -0
- vianu/spock/assets/css/styles.css +191 -0
- vianu/spock/assets/head/scripts.html +11 -0
- vianu/spock/assets/images/favicon.png +0 -0
- vianu/spock/assets/images/spock_logo.png +0 -0
- vianu/spock/assets/images/spock_logo_circular.png +0 -0
- vianu/spock/launch_demo_app.py +18 -0
- vianu/spock/launch_demo_pipeline.py +21 -0
- vianu/spock/requirements.txt +9 -0
- vianu/spock/settings.py +32 -0
- vianu/spock/src/__init__.py +0 -0
- vianu/spock/src/base.py +277 -0
- vianu/spock/src/cli.py +33 -0
- vianu/spock/src/ner.py +221 -0
- vianu/spock/src/scraping.py +922 -0
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
short_description: Spotting clinical knowledge
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SpoCK
|
| 3 |
+
emoji: 🖖
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
short_description: Spotting clinical knowledge
|
vianu/__init__.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from abc import ABC, abstractmethod
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, List
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from gradio.events import Dependency
|
| 8 |
+
|
| 9 |
+
LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
|
| 10 |
+
|
| 11 |
+
class BaseApp(ABC):
    """The abstract base class of the main gradio application.

    Subclasses implement :meth:`setup_ui` and :meth:`register_events`;
    callers then obtain the assembled :class:`gradio.Blocks` demo via
    :meth:`make`.
    """

    def __init__(
        self,
        app_name: str | None = None,
        favicon_path: Path | None = None,
        allowed_paths: List[str] | None = None,
        head_file: Path | None = None,
        css_file: Path | None = None,
        theme: gr.Theme | None = None,
        local_state: Any | None = None,
        session_state: Any | None = None,
    ):
        """
        Args:
            app_name: The name of the application. Defaults to None.
            favicon_path: The favicon file as a :class:`pathlib.Path`. Defaults to None.
            allowed_paths: Additional filesystem paths gradio is allowed to serve. Defaults to None.
            head_file: Custom html code as a :class:`pathlib.Path` to a html file. Defaults to None.
            css_file: Custom css as a :class:`pathlib.Path` to a css file. Defaults to None.
            theme: The theme of the application. Defaults to None.
            local_state: The local state, where data persists in the browser's localStorage even after the page is refreshed or closed. Should be a json-serializable value (accessible only through it's serialized form). Defaults to None.
            session_state: The session state, where data persists across multiple submits within a page session. Defaults to None.
        """
        # Public attributes: read by the launch code (e.g. passed to `demo.launch(...)`).
        self.favicon_path = favicon_path
        self.allowed_paths = allowed_paths

        self._app_name = app_name
        self._head_file = head_file
        self._css_file = css_file
        self._theme = theme

        # Wrap the raw state values in gradio state components; both are
        # rendered explicitly inside the Blocks context in `make()`.
        self._local_state = gr.BrowserState(local_state)
        self._session_state = gr.State(session_state)

    @abstractmethod
    def setup_ui(self):
        """Set up the user interface."""
        pass

    @abstractmethod
    def register_events(self):
        """Register the events."""
        pass

    def make(self) -> Dependency:
        """Assemble and return the gradio Blocks demo.

        Renders the state components, builds the UI (`setup_ui`) and wires
        the event handlers (`register_events`) inside a single
        :class:`gradio.Blocks` context.
        """
        with gr.Blocks(
            title=self._app_name,
            head_paths=self._head_file,
            css_paths=self._css_file,
            theme=self._theme,
        ) as demo:
            self._local_state.render()
            self._session_state.render()
            self.setup_ui()
            self.register_events()

            # Register a bare page-load event; presumably needed so an initial
            # render/update cycle is triggered — confirm against gradio docs.
            demo.load()
        return demo
vianu/spock/Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python 3.11 base image keeps the container small while matching the runtime
FROM python:3.11-slim

# Set environment variables to prevent Python from writing .pyc files and to buffer stdout and stderr
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Set the working directory inside the container
WORKDIR /app
ENV PYTHONPATH=/app
# Bind gradio to all interfaces so the app is reachable from outside the container
ENV GRADIO_SERVER_NAME="0.0.0.0"

# Copy the files into the working directory
# NOTE(review): COPY paths are resolved against the build context, and `..`
# cannot escape it — the ../../ prefix presumably assumes a specific
# `docker build` invocation/context; confirm how the image is built.
COPY ../../vianu/__init__.py /app/vianu/__init__.py
COPY ../../vianu/spock /app/vianu/spock

# Install dependencies from requirements.txt
RUN pip install --upgrade pip \
    && pip install -r vianu/spock/requirements.txt

# Expose the port your Gradio app will run on
# NOTE(review): 7868 differs from gradio's default 7860 — confirm it matches
# GRADIO_SERVER_PORT in vianu/spock/settings.py.
EXPOSE 7868

# Command to run the application
CMD ["python", "vianu/spock/launch_demo_app.py"]
vianu/spock/__init__.py
ADDED
|
File without changes
|
vianu/spock/__main__.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from argparse import Namespace
|
| 2 |
+
import asyncio
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
import logging
|
| 5 |
+
import sys
|
| 6 |
+
from typing import List, Tuple
|
| 7 |
+
|
| 8 |
+
from vianu import LOG_FMT
|
| 9 |
+
from vianu.spock.settings import SCRAPING_SOURCES, LOG_LEVEL
|
| 10 |
+
from vianu.spock.src.cli import parse_args
|
| 11 |
+
from vianu.spock.src.base import Setup, Document, SpoCK, FileHandler
|
| 12 |
+
from vianu.spock.src import scraping as scp
|
| 13 |
+
from vianu.spock.src import ner
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
logging.basicConfig(format=LOG_FMT, level=LOG_LEVEL)
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
async def _orchestrator(
    args_: Namespace,
    src_queue: asyncio.Queue,
    scp_queue: asyncio.Queue,
    ner_queue: asyncio.Queue,
    scp_tasks: List[asyncio.Task],
    ner_tasks: List[asyncio.Task],
) -> None:
    """Orchestrates the scraping and NER tasks.

    It waits for all scraping tasks to finish, then sends a sentinel to the scp_queue for each ner task (which will
    trigger the ner tasks to finish -> cf :func:`vianu.spock.src.ner.apply`).

    Args:
        args_: Parsed pipeline arguments; only ``args_.source`` is read here.
        src_queue: Queue feeding source names to the scraping tasks.
        scp_queue: Hand-off queue between scraping and NER tasks.
        ner_queue: Queue carrying final results; a single ``None`` sentinel is
            appended once everything is done (consumed by the collector).
        scp_tasks: The running scraping tasks.
        ner_tasks: The running NER tasks.
    """
    logger.debug('setting up orchestrator task')

    # Insert sources into the source queue
    sources = args_.source
    for src in sources:
        await src_queue.put(src)

    # Insert sentinel for each scraping task
    # (each scraper exits after consuming exactly one None)
    for _ in range(len(scp_tasks)):
        await src_queue.put(None)

    # Wait for all scraper tasks to finish and stop them
    await src_queue.join()
    try:
        await asyncio.gather(*scp_tasks)
    except asyncio.CancelledError:
        logger.warning('scraping task(s) have previously been canceled')
    except Exception as e:
        logger.error(f'scraping task(s) failed with error: {e}')
        raise e
    # cancel() is a no-op for tasks that already finished; it only stops
    # stragglers that are somehow still alive at this point.
    for st in scp_tasks:
        st.cancel()

    # Insert sentinel for each NER task
    for _ in range(len(ner_tasks)):
        await scp_queue.put(None)

    # Wait for NER tasks to process all items and finish
    await scp_queue.join()
    try:
        await asyncio.gather(*ner_tasks)
    except asyncio.CancelledError:
        logger.warning('ner task(s) have previously been canceled')
    except Exception as e:
        logger.error(f'ner task(s) failed with error: {e}')
        raise e
    for nt in ner_tasks:
        nt.cancel()

    # Insert sentinel into ner_queue to indicate end of processing
    await ner_queue.put(None)
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def setup_asyncio_framework(args_: Namespace) -> Tuple[asyncio.Queue, List[asyncio.Task], List[asyncio.Task], asyncio.Task]:
    """Wire up the queues and tasks that drive one SpoCK run.

    Falls back to all configured scraping sources when none were requested,
    creates the three hand-off queues (sources -> scraping -> NER), starts the
    worker tasks, and launches the orchestrator that feeds and terminates them.

    Returns:
        The NER output queue, the scraping tasks, the NER tasks, and the
        orchestrator task.
    """
    # Default to every configured source when the caller did not restrict them.
    if args_.source is None:
        args_.source = SCRAPING_SOURCES

    # One queue per pipeline stage: src_queue feeds the scrapers, scp_queue
    # feeds the NER workers, ner_queue carries the final results.
    src_queue: asyncio.Queue = asyncio.Queue()
    scp_queue: asyncio.Queue = asyncio.Queue()
    ner_queue: asyncio.Queue = asyncio.Queue()

    # Spawn the stage workers, then the orchestrator that shepherds them.
    scp_tasks = scp.create_tasks(args_=args_, queue_in=src_queue, queue_out=scp_queue)
    ner_tasks = ner.create_tasks(args_=args_, queue_in=scp_queue, queue_out=ner_queue)
    orchestration = _orchestrator(
        args_=args_,
        src_queue=src_queue,
        scp_queue=scp_queue,
        ner_queue=ner_queue,
        scp_tasks=scp_tasks,
        ner_tasks=ner_tasks,
    )
    orc_task = asyncio.create_task(orchestration)

    return ner_queue, scp_tasks, ner_tasks, orc_task
| 101 |
+
|
| 102 |
+
|
| 103 |
+
async def _collector(ner_queue: asyncio.Queue) -> List[Document]:
    """Drain the NER queue into a list of documents.

    Consumes items until the ``None`` sentinel (inserted by the orchestrator)
    is seen. Every retrieved item — including the sentinel — is acknowledged
    with ``task_done()`` so that ``ner_queue.join()`` can unblock.
    """
    collected = []
    # Walrus loop: exit as soon as the sentinel shows up.
    while (item := await ner_queue.get()) is not None:
        collected.append(item.doc)
        ner_queue.task_done()
    # Acknowledge the sentinel itself before returning.
    ner_queue.task_done()
    return collected
| 119 |
+
|
| 120 |
+
|
| 121 |
+
async def main(args_: Namespace | None = None, save: bool = True) -> None:
    """Main function for the SpoCK pipeline.

    Args:
        args_: Parsed CLI arguments; if None they are read from ``sys.argv``.
        save: If True, write the assembled :class:`SpoCK` result to disk
            (requires both ``args_.file_name`` and ``args_.file_path``).
    """
    started_at = datetime.now()
    if args_ is None:
        args_ = parse_args(sys.argv[1:])

    # The module-level basicConfig() call already installed handlers on the
    # root logger, which makes a second basicConfig() a silent no-op — the
    # CLI-supplied log level would be ignored. force=True reconfigures the
    # root logger so the requested level actually takes effect.
    logging.basicConfig(level=args_.log_level.upper(), format=LOG_FMT, force=True)
    logger.info(f'starting SpoCK (args_={args_})')

    # Set up async structure (scraping queue/tasks, NER queue/tasks, orchestrator task)
    ner_queue, _, _, _ = setup_asyncio_framework(args_)

    # Set up collector task and wait for it to finish
    # NOTE: if collector task is finished, the orchestrator is also finished (because of the sentinel in `ner_queue`)
    # and therefore so are the scraping and NER tasks
    col_task = asyncio.create_task(_collector(ner_queue))
    data = await col_task
    await ner_queue.join()

    # Save data (only assemble the result object when it will actually be written)
    if save:
        file_name = args_.file_name
        file_path = args_.file_path
        if file_name is not None and file_path is not None:
            spock = SpoCK(
                id_=str(args_),
                status='completed',
                started_at=started_at,
                finished_at=datetime.now(),
                setup=Setup.from_namespace(args_),
                data=data,
            )
            FileHandler(file_path=file_path).write(file_name=file_name, spock=spock)
    logger.info('finished SpoCK')
| 155 |
+
|
| 156 |
+
|
| 157 |
+
if __name__ == '__main__':
|
| 158 |
+
asyncio.run(main())
|
vianu/spock/app/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from vianu.spock.app.app import App
|
| 2 |
+
|
| 3 |
+
__all__ = ['App']
|
vianu/spock/app/app.py
ADDED
|
@@ -0,0 +1,592 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, List, Tuple
|
| 8 |
+
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
from vianu.spock.settings import LOG_LEVEL, N_SCP_TASKS, N_NER_TASKS
|
| 13 |
+
from vianu.spock.settings import LARGE_LANGUAGE_MODELS, SCRAPING_SOURCES, MAX_DOCS_SRC
|
| 14 |
+
from vianu.spock.settings import GRADIO_APP_NAME, GRADIO_SERVER_PORT, GRADIO_MAX_JOBS, GRADIO_UPDATE_INTERVAL
|
| 15 |
+
from vianu.spock.settings import OLLAMA_BASE_URL_ENV_NAME, OPENAI_API_KEY_ENV_NAME
|
| 16 |
+
from vianu.spock.src.base import Setup, SpoCK, SpoCKList, QueueItem # noqa: F401
|
| 17 |
+
from vianu import BaseApp
|
| 18 |
+
from vianu.spock.__main__ import setup_asyncio_framework
|
| 19 |
+
import vianu.spock.app.formatter as fmt
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
load_dotenv()
|
| 23 |
+
|
| 24 |
+
# App settings
|
| 25 |
+
# Root of the static assets shipped with the app (css, images, html head snippets).
_ASSETS_PATH = Path(__file__).parents[1] / "assets"

# (display label, settings value) pairs for the UI radio/checkbox groups.
# `strict=True` makes zip raise ValueError whenever the label lists get out of
# sync with the settings lists. The previous `len(choices) == len(settings)`
# checks could not detect a settings list *shorter* than its label list,
# because plain zip silently truncates to the shorter input.
_UI_SETTINGS_LLM_CHOICES = list(zip(['Ollama', 'OpenAI'], LARGE_LANGUAGE_MODELS, strict=True))
_UI_SETTINGS_SOURCE_CHOICES = list(zip(['PubMed', 'EMA', 'MHRA', 'FDA'], SCRAPING_SOURCES, strict=True))
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass
class LocalState:
    """The persistent local state.

    Stored via :class:`gr.BrowserState` in the browser's localStorage, so the
    values survive page reloads. All fields default to the module settings.
    """
    log_level: str = LOG_LEVEL  # logging verbosity used for pipeline runs
    n_scp_tasks: int = N_SCP_TASKS  # number of concurrent scraping tasks
    n_ner_tasks: int = N_NER_TASKS  # number of concurrent NER tasks
    max_jobs: int = GRADIO_MAX_JOBS  # max number of job cards kept in the UI
    update_interval: float = GRADIO_UPDATE_INTERVAL  # gr.Timer refresh period (seconds)
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class SessionState:
    """The session dependent state.

    Holds the asyncio plumbing of the currently running pipeline plus the
    per-session job history (`spocks`; new jobs are inserted at index 0).
    """

    # Asyncio setup
    ner_queue: asyncio.Queue | None = None  # output queue of the NER workers
    scp_tasks: List[asyncio.Task] = field(default_factory=list)  # scraping worker tasks
    ner_tasks: List[asyncio.Task] = field(default_factory=list)  # NER worker tasks
    orc_task: asyncio.Task | None = None  # orchestrator task
    col_task: asyncio.Task | None = None  # result-collector task

    # Data
    is_running: bool = False  # True while a pipeline run is in progress
    spocks: SpoCKList = field(default_factory=list)  # job history (newest first)
    _index_running_spock: int | None = None  # index into `spocks` of the running job
    _index_active_spock: int | None = None  # index into `spocks` of the job shown in the UI

    def set_running_spock(self, index: int | None) -> None:
        """Remember which entry of `spocks` belongs to the running pipeline (None clears it)."""
        self._index_running_spock = index

    def get_running_spock(self) -> SpoCK:
        """Return the currently running job.

        NOTE(review): raises TypeError if no running index has been set —
        confirm callers always call `set_running_spock` first.
        """
        return self.spocks[self._index_running_spock]

    def set_active_spock(self, index: int | None) -> None:
        """Remember which entry of `spocks` is selected/displayed in the UI (None clears it)."""
        self._index_active_spock = index

    def get_active_spock(self) -> SpoCK:
        """Return the job selected in the UI (see the note on `get_running_spock`)."""
        return self.spocks[self._index_active_spock]
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class App(BaseApp):
|
| 76 |
+
"""The main gradio application."""
|
| 77 |
+
|
| 78 |
+
def __init__(self):
|
| 79 |
+
super().__init__(
|
| 80 |
+
app_name=GRADIO_APP_NAME,
|
| 81 |
+
favicon_path=_ASSETS_PATH / "images" / "favicon.png",
|
| 82 |
+
allowed_paths=[str(_ASSETS_PATH.resolve())],
|
| 83 |
+
head_file=_ASSETS_PATH / "head" / "scripts.html",
|
| 84 |
+
css_file=_ASSETS_PATH / "css" / "styles.css",
|
| 85 |
+
theme=gr.themes.Soft(),
|
| 86 |
+
local_state=LocalState(),
|
| 87 |
+
session_state=SessionState(),
|
| 88 |
+
)
|
| 89 |
+
self._components: Dict[str, Any] = {}
|
| 90 |
+
|
| 91 |
+
# --------------------------------------------------------------------------
|
| 92 |
+
# User Interface
|
| 93 |
+
# --------------------------------------------------------------------------
|
| 94 |
+
@staticmethod
|
| 95 |
+
def _ui_top_row():
|
| 96 |
+
with gr.Row(elem_classes="top-row"):
|
| 97 |
+
with gr.Column(scale=1):
|
| 98 |
+
gr.Image(
|
| 99 |
+
value=_ASSETS_PATH / "images" / "spock_logo_circular.png",
|
| 100 |
+
show_label=False,
|
| 101 |
+
elem_classes="image",
|
| 102 |
+
)
|
| 103 |
+
with gr.Column(scale=5):
|
| 104 |
+
value = """<div class='top-row title-desc'>
|
| 105 |
+
<div class='top-row title-desc title'>SpoCK: Spotting Clinical Knowledge</div>
|
| 106 |
+
<div class='top-row title-desc desc'><em>A tool for identifying <b>medicinal products</b> and <b>adverse drug reactions</b> inside publicly available literature</em></div>
|
| 107 |
+
</div>
|
| 108 |
+
"""
|
| 109 |
+
gr.Markdown(value=value)
|
| 110 |
+
|
| 111 |
+
def _ui_corpus_settings(self):
|
| 112 |
+
"""Settings column."""
|
| 113 |
+
with gr.Column(scale=1):
|
| 114 |
+
with gr.Accordion(label='LLM Endpoint'):
|
| 115 |
+
self._components['settings.llm_radio'] = gr.Radio(
|
| 116 |
+
label='Model', show_label=False, choices=_UI_SETTINGS_LLM_CHOICES, value='llama', interactive=True
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# 'llama' specific settings
|
| 120 |
+
with gr.Group(visible=True) as self._components['settings.ollama_group']:
|
| 121 |
+
value = os.environ.get("OLLAMA_BASE_URL")
|
| 122 |
+
placeholder = 'base_url of ollama endpoint' if value is None else None
|
| 123 |
+
gr.Markdown('---')
|
| 124 |
+
self._components['settings.ollama_base_url'] = gr.Textbox(
|
| 125 |
+
label='base_url',
|
| 126 |
+
show_label=False,
|
| 127 |
+
info='base_url',
|
| 128 |
+
placeholder=placeholder,
|
| 129 |
+
value=value,
|
| 130 |
+
interactive=True,
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
# 'openai' specific settings
|
| 134 |
+
with gr.Group(visible=False) as self._components['settings.openai_group']:
|
| 135 |
+
value = os.environ.get("OPENAI_API_KEY")
|
| 136 |
+
placeholder = 'api_key of openai endpoint' if value is None else None
|
| 137 |
+
logger.debug(f'openai api_key={value}')
|
| 138 |
+
gr.Markdown('---')
|
| 139 |
+
self._components['settings.openai_api_key'] = gr.Textbox(
|
| 140 |
+
label='api_key',
|
| 141 |
+
show_label=False,
|
| 142 |
+
info='api_key',
|
| 143 |
+
placeholder=placeholder,
|
| 144 |
+
value=value,
|
| 145 |
+
interactive=True,
|
| 146 |
+
type='password',
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
with gr.Accordion(label='Sources', open=True):
|
| 150 |
+
self._components['settings.source'] = gr.CheckboxGroup(
|
| 151 |
+
label='Sources', show_label=False, choices=_UI_SETTINGS_SOURCE_CHOICES, value=SCRAPING_SOURCES, interactive=True
|
| 152 |
+
)
|
| 153 |
+
self._components['settings.max_docs_src'] = gr.Number(
|
| 154 |
+
label='max_docs_src', show_label=False, info='max. number of documents per source', value=MAX_DOCS_SRC, interactive=True
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
def _ui_corpus_row(self):
|
| 158 |
+
"""Main corpus with settings, search field, job cards, and details"""
|
| 159 |
+
with gr.Row(elem_classes="bottom-container"):
|
| 160 |
+
self._ui_corpus_settings()
|
| 161 |
+
self._ui_corpus_main()
|
| 162 |
+
|
| 163 |
+
def _ui_corpus_main(self):
|
| 164 |
+
"""Search field, job cards, and details."""
|
| 165 |
+
with gr.Column(scale=5):
|
| 166 |
+
# Search text field and start/stop/cancel buttons
|
| 167 |
+
with gr.Row(elem_classes="search-container"):
|
| 168 |
+
with gr.Column(scale=3):
|
| 169 |
+
self._components['main.search_term'] = gr.Textbox(
|
| 170 |
+
label="Search", show_label=False, placeholder="Enter your search term"
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
with gr.Column(scale=1, elem_classes='pipeline-button'):
|
| 174 |
+
self._components['main.start_button'] = gr.HTML('<div class="button-not-running">Start</div>', visible=True)
|
| 175 |
+
self._components['main.stop_button'] = gr.HTML('<div class="button-running">Stop</div>', visible=False)
|
| 176 |
+
self._components['main.cancel_button'] = gr.HTML('<div class="canceling">canceling...</div>', visible=False)
|
| 177 |
+
|
| 178 |
+
# Job summary cards
|
| 179 |
+
with gr.Row(elem_classes="jobs-container"):
|
| 180 |
+
self._components['main.cards'] = [gr.HTML('', elem_id=f'job-{i}', visible=False) for i in range(GRADIO_MAX_JOBS)]
|
| 181 |
+
|
| 182 |
+
# Details of the selected job
|
| 183 |
+
with gr.Row():
|
| 184 |
+
self._components['main.details'] = gr.HTML('<div class="details-container"></div>')
|
| 185 |
+
|
| 186 |
+
def setup_ui(self):
|
| 187 |
+
"""Set up the user interface."""
|
| 188 |
+
self._ui_top_row()
|
| 189 |
+
self._ui_corpus_row()
|
| 190 |
+
self._components['timer'] = gr.Timer(value=GRADIO_UPDATE_INTERVAL, active=False, render=True)
|
| 191 |
+
|
| 192 |
+
# --------------------------------------------------------------------------
|
| 193 |
+
# Helpers
|
| 194 |
+
# --------------------------------------------------------------------------
|
| 195 |
+
@staticmethod
|
| 196 |
+
def _show_llm_settings(llm: str) -> Tuple[dict[str, Any], dict[str, Any]]:
|
| 197 |
+
logger.debug(f'show {llm} model settings')
|
| 198 |
+
if llm == 'llama':
|
| 199 |
+
return gr.update(visible=True), gr.update(visible=False)
|
| 200 |
+
elif llm == 'openai':
|
| 201 |
+
return gr.update(visible=False), gr.update(visible=True)
|
| 202 |
+
else:
|
| 203 |
+
return gr.update(visible=False), gr.update(visible=False)
|
| 204 |
+
|
| 205 |
+
@staticmethod
|
| 206 |
+
def _set_ollama_base_url(base_url: str) -> None:
|
| 207 |
+
"""Setup ollama base_url as environment variable."""
|
| 208 |
+
logger.debug(f'set ollama base_url environment variable ({OLLAMA_BASE_URL_ENV_NAME}={base_url})')
|
| 209 |
+
os.environ[OLLAMA_BASE_URL_ENV_NAME] = base_url
|
| 210 |
+
|
| 211 |
+
@staticmethod
|
| 212 |
+
def _set_openai_api_key(api_key: str) -> None:
|
| 213 |
+
"""Setup openai api_key as environment variable."""
|
| 214 |
+
log_key = '*****' if api_key else 'None'
|
| 215 |
+
logger.debug(f'set openai api key (api_key={log_key})')
|
| 216 |
+
os.environ[OPENAI_API_KEY_ENV_NAME] = api_key
|
| 217 |
+
|
| 218 |
+
@staticmethod
|
| 219 |
+
def _feed_cards_to_ui(local_state: dict, session_state: SessionState) -> List[dict[str, Any]]:
|
| 220 |
+
"""For all existing SpoCKs, create and feed the job cards to the UI."""
|
| 221 |
+
spocks = session_state.spocks
|
| 222 |
+
logger.debug(f'feeding cards to UI (len(spocks)={len(spocks)})')
|
| 223 |
+
|
| 224 |
+
# Create the job cards for the existing spocks
|
| 225 |
+
cds = []
|
| 226 |
+
for i, spk in enumerate(spocks):
|
| 227 |
+
html = fmt.get_job_card_html(i, spk)
|
| 228 |
+
cds.append(gr.update(value=html, visible=True))
|
| 229 |
+
|
| 230 |
+
# Extdend with not-visible cards
|
| 231 |
+
# Note: for gradio >= 5.0.0 this logic could be replaces with dynamic number of gr.Blocks
|
| 232 |
+
# (see https://www.gradio.app/guides/dynamic-apps-with-render-decorator)
|
| 233 |
+
cds.extend([gr.update(visible=False) for _ in range(local_state['max_jobs'] - len(spocks))])
|
| 234 |
+
return cds
|
| 235 |
+
|
| 236 |
+
@staticmethod
|
| 237 |
+
def _feed_details_to_ui(session_state: SessionState) -> str:
|
| 238 |
+
"""Collect the html texts for the documents of the selected job and feed them to the UI."""
|
| 239 |
+
if len(session_state.spocks) == 0:
|
| 240 |
+
return fmt.get_details_html([])
|
| 241 |
+
|
| 242 |
+
active_spock = session_state.get_active_spock()
|
| 243 |
+
logger.debug(f'feeding details to UI (len(data)={len(active_spock.data)})')
|
| 244 |
+
return fmt.get_details_html(active_spock.data)
|
| 245 |
+
|
| 246 |
+
@staticmethod
|
| 247 |
+
def _check_llm_settings(llm: str) -> None:
|
| 248 |
+
"""Check if the LLM settings are set."""
|
| 249 |
+
if llm == 'llama':
|
| 250 |
+
if os.environ.get(OLLAMA_BASE_URL_ENV_NAME) is None:
|
| 251 |
+
raise gr.Error('Ollama base_url is not set (submit value with Enter)')
|
| 252 |
+
elif llm == 'openai':
|
| 253 |
+
if os.environ.get(OPENAI_API_KEY_ENV_NAME) is None:
|
| 254 |
+
raise gr.Error('OpenAI api_key is not set (submit value with Enter)')
|
| 255 |
+
|
| 256 |
+
@staticmethod
|
| 257 |
+
def _toggle_button(session_state: SessionState) -> Tuple[SessionState, dict[str, Any], dict[str, Any], dict[str, Any]]:
|
| 258 |
+
"""Toggle the state of the pipleline between running <-> not_running.
|
| 259 |
+
|
| 260 |
+
As a result the corresponding buttons (Start, Stop, canceling...) are shown/hidden.
|
| 261 |
+
"""
|
| 262 |
+
logger.debug(f'toggle button (is_running={session_state.is_running}->{not session_state.is_running})')
|
| 263 |
+
session_state.is_running = not session_state.is_running
|
| 264 |
+
if session_state.is_running:
|
| 265 |
+
# Show the stop button and hide the start/cancel button
|
| 266 |
+
return session_state, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
| 267 |
+
else:
|
| 268 |
+
# Show the start button and hide the stop/cancel button
|
| 269 |
+
return session_state, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
|
| 270 |
+
|
| 271 |
+
@staticmethod
|
| 272 |
+
def _show_cancel_button() -> Tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
|
| 273 |
+
"""Shows the cancel button and hides the start and stop button."""
|
| 274 |
+
logger.debug('show cancel button')
|
| 275 |
+
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
|
| 276 |
+
|
| 277 |
+
@staticmethod
|
| 278 |
+
def _setup_spock(term: str, model: str, source: List[str], max_docs_src: int, local_state: dict, session_state: SessionState) -> SessionState:
|
| 279 |
+
"""Setup a new SpoCK object."""
|
| 280 |
+
max_jobs = local_state['max_jobs']
|
| 281 |
+
spocks = session_state.spocks
|
| 282 |
+
|
| 283 |
+
# Check if the maximum number of jobs is reached and pop the last job if necessary
|
| 284 |
+
if len(spocks) >= max_jobs:
|
| 285 |
+
msg = f'max number of jobs ({max_jobs}); last job "{spocks[-1].setup.term}" is removed'
|
| 286 |
+
gr.Warning(msg)
|
| 287 |
+
logger.warning(msg)
|
| 288 |
+
spocks.pop(-1)
|
| 289 |
+
|
| 290 |
+
# Setup the running_spock and append it to the list of spocks
|
| 291 |
+
msg = f'started SpoCK for "{term}"'
|
| 292 |
+
gr.Info(msg)
|
| 293 |
+
logger.info(msg)
|
| 294 |
+
|
| 295 |
+
# Create new SpoCK object
|
| 296 |
+
setup = Setup(
|
| 297 |
+
id_=f'{term} {source} {model}',
|
| 298 |
+
term=term,
|
| 299 |
+
model=model,
|
| 300 |
+
source=source,
|
| 301 |
+
max_docs_src=max_docs_src,
|
| 302 |
+
log_level=local_state['log_level'],
|
| 303 |
+
n_scp_tasks=local_state['n_scp_tasks'],
|
| 304 |
+
n_ner_tasks=local_state['n_ner_tasks'],
|
| 305 |
+
)
|
| 306 |
+
spock = SpoCK(
|
| 307 |
+
id_=setup.id_,
|
| 308 |
+
status='running',
|
| 309 |
+
setup=setup,
|
| 310 |
+
started_at=setup.submission,
|
| 311 |
+
data=[]
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
# Set the running and focused SpoCK to be the new SpoCK object
|
| 315 |
+
index = 0
|
| 316 |
+
spocks.insert(index, spock)
|
| 317 |
+
session_state.set_running_spock(index=index)
|
| 318 |
+
session_state.set_active_spock(index=index)
|
| 319 |
+
return session_state
|
| 320 |
+
|
| 321 |
+
@staticmethod
|
| 322 |
+
async def _collector(session_state: SessionState) -> None:
|
| 323 |
+
"""Append the processed document(s) from the `session_state.ner_queue` to the running spock."""
|
| 324 |
+
running_spock = session_state.get_running_spock()
|
| 325 |
+
ner_queue = session_state.ner_queue
|
| 326 |
+
logger.debug(f'starting collector (term={running_spock.setup.term})')
|
| 327 |
+
|
| 328 |
+
while True:
|
| 329 |
+
item = await ner_queue.get() # type: QueueItem
|
| 330 |
+
# Check stopping condition (added by the `orchestrator` in `vianu.spock.__main__`)
|
| 331 |
+
if item is None:
|
| 332 |
+
ner_queue.task_done()
|
| 333 |
+
break
|
| 334 |
+
running_spock.data.append(item.doc)
|
| 335 |
+
ner_queue.task_done()
|
| 336 |
+
|
| 337 |
+
async def _setup_asyncio_framework(self, session_state: SessionState) -> SessionState:
|
| 338 |
+
""""Start the SpoCK processes by setting up the asyncio framework and starting the asyncio tasks.
|
| 339 |
+
|
| 340 |
+
Main components of asyncio framework are:
|
| 341 |
+
- ner_queue: queue for collecting results from named entity recognition tasks
|
| 342 |
+
- scp_tasks: scraping tasks (cf. `vianu.spock.src.scp`)
|
| 343 |
+
- ner_tasks: named entity recognition tasks (cf. `vianu.spock.src.ner`)
|
| 344 |
+
- orc_task: orchestrating the process
|
| 345 |
+
- col_task: collect and assemble the final results
|
| 346 |
+
"""
|
| 347 |
+
logger.info("setting up asyncio framework")
|
| 348 |
+
|
| 349 |
+
# Setup asyncio tasks as in `vianu.spock.__main__`
|
| 350 |
+
args_ = session_state.get_running_spock().setup.to_namespace()
|
| 351 |
+
ner_queue, scp_tasks, ner_tasks, orc_task = setup_asyncio_framework(args_=args_)
|
| 352 |
+
session_state.ner_queue = ner_queue
|
| 353 |
+
session_state.scp_tasks = scp_tasks
|
| 354 |
+
session_state.ner_tasks = ner_tasks
|
| 355 |
+
session_state.orc_task = orc_task
|
| 356 |
+
|
| 357 |
+
# Setup the app specific collection task
|
| 358 |
+
col_task = asyncio.create_task(self._collector(session_state=session_state))
|
| 359 |
+
session_state.col_task = col_task
|
| 360 |
+
|
| 361 |
+
return session_state
|
| 362 |
+
|
| 363 |
+
@staticmethod
|
| 364 |
+
async def _conclusion(session_state: SessionState) -> SessionState:
|
| 365 |
+
# Wait collector task to finish and join ner_queue
|
| 366 |
+
try:
|
| 367 |
+
await session_state.col_task
|
| 368 |
+
except asyncio.CancelledError:
|
| 369 |
+
logger.warning('collector task canceled')
|
| 370 |
+
return session_state # This stops the _conclusion step in the case the _canceling step was triggered
|
| 371 |
+
except Exception as e:
|
| 372 |
+
logger.error(f'collector task failed with error: {e}')
|
| 373 |
+
raise e
|
| 374 |
+
await session_state.ner_queue.join()
|
| 375 |
+
|
| 376 |
+
# Update the running_spock with the final data
|
| 377 |
+
running_spock = session_state.get_running_spock()
|
| 378 |
+
running_spock.status = 'completed'
|
| 379 |
+
running_spock.finished_at = datetime.now()
|
| 380 |
+
|
| 381 |
+
# Log the conclusion and update/empty the running_spock
|
| 382 |
+
gr.Info(f'job "{running_spock.setup.term}" finished')
|
| 383 |
+
logger.info(f'job "{running_spock.setup.term}" finished in {running_spock.runtime()}')
|
| 384 |
+
|
| 385 |
+
return session_state
|
| 386 |
+
|
| 387 |
+
@staticmethod
|
| 388 |
+
async def _canceling(session_state: SessionState) -> SessionState:
|
| 389 |
+
"""Cancel all running :class:`asyncio.Task`."""
|
| 390 |
+
running_spock = session_state.get_running_spock()
|
| 391 |
+
gr.Info(f'canceled SpoCK for "{running_spock.setup.term}"')
|
| 392 |
+
|
| 393 |
+
# Update the running_spock
|
| 394 |
+
running_spock.status = 'stopped'
|
| 395 |
+
running_spock.finished_at = datetime.now()
|
| 396 |
+
|
| 397 |
+
# Cancel scraping tasks
|
| 398 |
+
logger.warning("canceling scraping tasks")
|
| 399 |
+
for task in session_state.scp_tasks:
|
| 400 |
+
task.cancel()
|
| 401 |
+
await asyncio.gather(*session_state.scp_tasks, return_exceptions=True)
|
| 402 |
+
|
| 403 |
+
# Cancel named entity recognition tasks
|
| 404 |
+
logger.warning("canceling named entity recognition tasks")
|
| 405 |
+
for task in session_state.ner_tasks:
|
| 406 |
+
task.cancel()
|
| 407 |
+
await asyncio.gather(*session_state.ner_tasks, return_exceptions=True)
|
| 408 |
+
|
| 409 |
+
# Cancel orchestrator task
|
| 410 |
+
logger.warning("canceling orchestrator task")
|
| 411 |
+
session_state.orc_task.cancel()
|
| 412 |
+
await asyncio.gather(session_state.orc_task, return_exceptions=True) # we use return_exceptions=True to avoid raising exceptions due to the subtasks being allready canceled`
|
| 413 |
+
|
| 414 |
+
# Cancel collector task
|
| 415 |
+
logger.warning("canceling collector task")
|
| 416 |
+
session_state.col_task.cancel()
|
| 417 |
+
await asyncio.gather(session_state.col_task, return_exceptions=True) # see remark above
|
| 418 |
+
|
| 419 |
+
return session_state
|
| 420 |
+
|
| 421 |
+
@staticmethod
|
| 422 |
+
def _change_active_spock_number(session_state: SessionState, index: int) -> SessionState:
|
| 423 |
+
logger.debug(f'card clicked={index}')
|
| 424 |
+
session_state.set_active_spock(index=index)
|
| 425 |
+
return session_state
|
| 426 |
+
|
| 427 |
+
# --------------------------------------------------------------------------
|
| 428 |
+
# Events
|
| 429 |
+
# --------------------------------------------------------------------------
|
| 430 |
+
def _event_timer(self):
|
| 431 |
+
self._components['timer'].tick(
|
| 432 |
+
fn=self._feed_cards_to_ui,
|
| 433 |
+
inputs=[self._local_state, self._session_state],
|
| 434 |
+
outputs=self._components['main.cards'],
|
| 435 |
+
).then(
|
| 436 |
+
fn=self._feed_details_to_ui,
|
| 437 |
+
inputs=[self._session_state],
|
| 438 |
+
outputs=self._components['main.details'],
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
def _event_choose_llm(self):
|
| 442 |
+
"""Choose LLM model show the correspoding settings."""
|
| 443 |
+
self._components['settings.llm_radio'].change(
|
| 444 |
+
fn=self._show_llm_settings,
|
| 445 |
+
inputs=self._components['settings.llm_radio'],
|
| 446 |
+
outputs=[
|
| 447 |
+
self._components['settings.ollama_group'],
|
| 448 |
+
self._components['settings.openai_group'],
|
| 449 |
+
],
|
| 450 |
+
)
|
| 451 |
+
|
| 452 |
+
def _event_settings_ollama(self):
|
| 453 |
+
"""Callback of the ollama settings."""
|
| 454 |
+
self._components['settings.ollama_base_url'].submit(
|
| 455 |
+
fn=self._set_ollama_base_url,
|
| 456 |
+
inputs=self._components['settings.ollama_base_url'],
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
def _event_settings_openai(self):
|
| 460 |
+
"""Callback of the openai settings."""
|
| 461 |
+
self._components['settings.openai_api_key'].submit(
|
| 462 |
+
fn=self._set_openai_api_key,
|
| 463 |
+
inputs=self._components['settings.openai_api_key'],
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
def _event_start_spock(self) -> None:
|
| 467 |
+
search_term = self._components['main.search_term']
|
| 468 |
+
start_button = self._components['main.start_button']
|
| 469 |
+
timer = self._components['timer']
|
| 470 |
+
|
| 471 |
+
gr.on(
|
| 472 |
+
triggers=[search_term.submit, start_button.click],
|
| 473 |
+
fn=self._check_llm_settings,
|
| 474 |
+
inputs=self._components['settings.llm_radio'],
|
| 475 |
+
).success(
|
| 476 |
+
fn=self._toggle_button,
|
| 477 |
+
inputs=self._session_state,
|
| 478 |
+
outputs=[
|
| 479 |
+
self._session_state,
|
| 480 |
+
self._components['main.start_button'],
|
| 481 |
+
self._components['main.stop_button'],
|
| 482 |
+
self._components['main.cancel_button'],
|
| 483 |
+
]
|
| 484 |
+
).then(
|
| 485 |
+
fn=self._setup_spock,
|
| 486 |
+
inputs=[
|
| 487 |
+
search_term,
|
| 488 |
+
self._components['settings.llm_radio'],
|
| 489 |
+
self._components['settings.source'],
|
| 490 |
+
self._components['settings.max_docs_src'],
|
| 491 |
+
self._local_state,
|
| 492 |
+
self._session_state,
|
| 493 |
+
],
|
| 494 |
+
outputs=self._session_state
|
| 495 |
+
).then(
|
| 496 |
+
fn=self._setup_asyncio_framework,
|
| 497 |
+
inputs=self._session_state,
|
| 498 |
+
outputs=self._session_state,
|
| 499 |
+
).then(
|
| 500 |
+
fn=lambda: None, outputs=search_term # Empty the search term in the UI
|
| 501 |
+
).then(
|
| 502 |
+
fn=self._feed_cards_to_ui,
|
| 503 |
+
inputs=[self._local_state, self._session_state],
|
| 504 |
+
outputs=self._components['main.cards'],
|
| 505 |
+
).then(
|
| 506 |
+
fn=lambda: gr.update(active=True), outputs=timer
|
| 507 |
+
).then(
|
| 508 |
+
fn=self._conclusion,
|
| 509 |
+
inputs=self._session_state,
|
| 510 |
+
outputs=self._session_state,
|
| 511 |
+
).then(
|
| 512 |
+
fn=self._feed_cards_to_ui,
|
| 513 |
+
inputs=[self._local_state, self._session_state],
|
| 514 |
+
outputs=self._components['main.cards'],
|
| 515 |
+
).then(
|
| 516 |
+
fn=self._feed_details_to_ui, # called one more time in order to enforce update of the details (regardless of the state of the timer)
|
| 517 |
+
inputs=[self._session_state],
|
| 518 |
+
outputs=self._components['main.details'],
|
| 519 |
+
).then(
|
| 520 |
+
fn=lambda: gr.update(active=False), outputs=timer
|
| 521 |
+
).then(
|
| 522 |
+
fn=self._toggle_button,
|
| 523 |
+
inputs=self._session_state,
|
| 524 |
+
outputs=[
|
| 525 |
+
self._session_state,
|
| 526 |
+
self._components['main.start_button'],
|
| 527 |
+
self._components['main.stop_button'],
|
| 528 |
+
self._components['main.cancel_button'],
|
| 529 |
+
]
|
| 530 |
+
)
|
| 531 |
+
|
| 532 |
+
def _event_stop_spock(self):
|
| 533 |
+
# NOTE: when `stop_button.click` is triggered, the above pipeline (started by `search_term.submit` or
|
| 534 |
+
# `start_button.click`) is still running and awaiting the `_conclusion` step to finish. The `stop_button.click`
|
| 535 |
+
# event will cause the `_conclusion` step to terminate, after which the subsequent steps will still be executed;
|
| 536 |
+
# -> therefore, there is no need to add these steps here.
|
| 537 |
+
self._components['main.stop_button'].click(
|
| 538 |
+
fn=self._show_cancel_button,
|
| 539 |
+
outputs=[
|
| 540 |
+
self._components['main.start_button'],
|
| 541 |
+
self._components['main.stop_button'],
|
| 542 |
+
self._components['main.cancel_button'],
|
| 543 |
+
]
|
| 544 |
+
).then(
|
| 545 |
+
fn=self._canceling,
|
| 546 |
+
inputs=self._session_state,
|
| 547 |
+
outputs=self._session_state,
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
def _event_card_click(self):
|
| 551 |
+
for index, crd in enumerate(self._components['main.cards']):
|
| 552 |
+
crd.click(
|
| 553 |
+
fn=self._change_active_spock_number,
|
| 554 |
+
inputs=[self._session_state, gr.Number(value=index, visible=False)],
|
| 555 |
+
outputs=self._session_state,
|
| 556 |
+
).then(
|
| 557 |
+
fn=self._feed_details_to_ui,
|
| 558 |
+
inputs=[self._session_state],
|
| 559 |
+
outputs=self._components['main.details'],
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
def register_events(self):
|
| 563 |
+
"""Register the events."""
|
| 564 |
+
# Setup timer for feed cards and details
|
| 565 |
+
self._event_timer()
|
| 566 |
+
|
| 567 |
+
# Settings events
|
| 568 |
+
self._event_choose_llm()
|
| 569 |
+
self._event_settings_ollama()
|
| 570 |
+
self._event_settings_openai()
|
| 571 |
+
|
| 572 |
+
# Start/Stop events
|
| 573 |
+
self._event_start_spock()
|
| 574 |
+
self._event_stop_spock()
|
| 575 |
+
|
| 576 |
+
# Card click events for showing details
|
| 577 |
+
self._event_card_click()
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
if __name__ == "__main__":
|
| 581 |
+
from vianu.spock.app import App
|
| 582 |
+
|
| 583 |
+
app = App()
|
| 584 |
+
demo = app.make()
|
| 585 |
+
demo.queue().launch(
|
| 586 |
+
favicon_path=app.favicon_path,
|
| 587 |
+
inbrowser=True,
|
| 588 |
+
allowed_paths=[
|
| 589 |
+
str(_ASSETS_PATH.resolve()),
|
| 590 |
+
],
|
| 591 |
+
server_port=GRADIO_SERVER_PORT,
|
| 592 |
+
)
|
vianu/spock/app/formatter.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
from vianu.spock.src.base import Document, SpoCK
|
| 5 |
+
from vianu.spock.settings import DATE_FORMAT
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
JOBS_CONTAINER_CARD_TEMPLATE = """
|
| 11 |
+
<div class="card" onclick="cardClickHandler(this)">
|
| 12 |
+
<div class="title">{title} {status}</div>
|
| 13 |
+
<div class="info">Date: {date}</div>
|
| 14 |
+
<div class="info">Sources: {sources}</div>
|
| 15 |
+
<div class="info">#docs: {n_doc} | #adr: {n_adr}</div>
|
| 16 |
+
</div>
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
DETAILS_CONTAINER_TEMPLATE = """
|
| 20 |
+
<div id='details' class='details-container'>
|
| 21 |
+
<div class='items'>{items}</div>
|
| 22 |
+
</div>
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
DETAILS_CONTAINER_ITEM_TEMPLATE = """
|
| 26 |
+
<div class='item'>
|
| 27 |
+
<div class='top'>
|
| 28 |
+
<div class='favicon'><img src='{favicon}' alt='Favicon'></div>
|
| 29 |
+
<div class='title'><a href='{url}'>{title}</a></div>
|
| 30 |
+
</div>
|
| 31 |
+
<div class='bottom'>
|
| 32 |
+
{text}
|
| 33 |
+
</div>
|
| 34 |
+
</div>
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _get_details_html_items(data: List[Document]):
|
| 39 |
+
"""Get the HTML items for the details container. Each item contains the favicon, title, and the text with the
|
| 40 |
+
highlighted named entities.
|
| 41 |
+
"""
|
| 42 |
+
items = []
|
| 43 |
+
max_title_lenth = 120
|
| 44 |
+
for doc in data:
|
| 45 |
+
items.append(
|
| 46 |
+
DETAILS_CONTAINER_ITEM_TEMPLATE.format(
|
| 47 |
+
favicon=doc.source_favicon_url,
|
| 48 |
+
url=doc.url,
|
| 49 |
+
title=doc.title[:max_title_lenth]
|
| 50 |
+
+ ("..." if len(doc.title) > max_title_lenth else ""),
|
| 51 |
+
text=doc.get_html(),
|
| 52 |
+
details="details",
|
| 53 |
+
)
|
| 54 |
+
)
|
| 55 |
+
return "\n".join(items)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_details_html(data: List[Document]):
|
| 59 |
+
"""Get the stacked HTML items for each document."""
|
| 60 |
+
if len(data) == 0:
|
| 61 |
+
return "<div>no results available (yet)</div>"
|
| 62 |
+
sorted_data = sorted(data, key=lambda x: (len(x.adverse_reactions), len(x.medicinal_products)), reverse=True)
|
| 63 |
+
items = _get_details_html_items(data=sorted_data)
|
| 64 |
+
return DETAILS_CONTAINER_TEMPLATE.format(items=items)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _get_status_html(status: str) -> str:
|
| 68 |
+
"""Get the HTML for the status."""
|
| 69 |
+
if status == "running":
|
| 70 |
+
return f"<span class='running'>({status.upper()})</span>"
|
| 71 |
+
elif status == "completed":
|
| 72 |
+
return f"<span class='completed'>({status.upper()})</span>"
|
| 73 |
+
elif status == "stopped":
|
| 74 |
+
return f"<span class='stopped'>({status.upper()})</span>"
|
| 75 |
+
else:
|
| 76 |
+
logger.error(f"unknown status: {status.upper()})")
|
| 77 |
+
return '<span>(status unknown)</span>'
|
| 78 |
+
|
| 79 |
+
def get_job_card_html(card_nmbr: int, spock: SpoCK):
|
| 80 |
+
"""Get the HTML for the job card."""
|
| 81 |
+
job = spock.setup
|
| 82 |
+
data = spock.data
|
| 83 |
+
|
| 84 |
+
title = spock.setup.term
|
| 85 |
+
status = _get_status_html(spock.status)
|
| 86 |
+
sources = ", ".join(job.source)
|
| 87 |
+
date = job.submission.strftime(DATE_FORMAT)
|
| 88 |
+
n_doc = len(data)
|
| 89 |
+
n_adr = sum([len(d.adverse_reactions) for d in data])
|
| 90 |
+
return JOBS_CONTAINER_CARD_TEMPLATE.format(
|
| 91 |
+
nmbr=card_nmbr,
|
| 92 |
+
title=title,
|
| 93 |
+
status=status,
|
| 94 |
+
date=date,
|
| 95 |
+
sources=sources,
|
| 96 |
+
n_doc=n_doc,
|
| 97 |
+
n_adr=n_adr,
|
| 98 |
+
)
|
vianu/spock/assets/css/styles.css
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* no footer */
|
| 2 |
+
footer {
|
| 3 |
+
display: none !important;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
/* customize scrollbar in gr.Dataframe */
|
| 7 |
+
::-webkit-scrollbar {
|
| 8 |
+
background: var(--background-fill-primary);
|
| 9 |
+
}
|
| 10 |
+
::-webkit-scrollbar-thumb {
|
| 11 |
+
background-color: var(--border-color-primary);
|
| 12 |
+
border: 4px solid transparent;
|
| 13 |
+
border-radius: 100px;
|
| 14 |
+
background-clip: content-box;
|
| 15 |
+
}
|
| 16 |
+
::-webkit-scrollbar-corner {
|
| 17 |
+
background: var(--background-fill-primary);
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.top-row {
|
| 21 |
+
display: flex;
|
| 22 |
+
align-items: center;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
.top-row .image {
|
| 26 |
+
display: flex;
|
| 27 |
+
justify-content: center;
|
| 28 |
+
align-items: center;
|
| 29 |
+
background-color: transparent !important;
|
| 30 |
+
border: none !important;
|
| 31 |
+
box-shadow: none !important;
|
| 32 |
+
padding: 0 !important;
|
| 33 |
+
height: 150px;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.top-row .title-desc {
|
| 37 |
+
justify-content: center;
|
| 38 |
+
display: block;
|
| 39 |
+
height: 100%;
|
| 40 |
+
background: var(--block-title-background-fill);
|
| 41 |
+
border-radius: var(--radius-lg);
|
| 42 |
+
font: var(--font);
|
| 43 |
+
text-align: center;
|
| 44 |
+
padding: var(--scale-0)
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
.top-row .title-desc .title {
|
| 48 |
+
color: var(--block-title-text-color);
|
| 49 |
+
font-size: var(--text-xxl);
|
| 50 |
+
font-weight: var(--weight-extrabold);
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.top-row .title-desc .desc {
|
| 54 |
+
padding: 0px !important;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.search-container {
|
| 58 |
+
display: flex;
|
| 59 |
+
align-items: flex-end;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
.search-container .button-not-running,
|
| 64 |
+
.search-container .button-running,
|
| 65 |
+
.search-container .canceling {
|
| 66 |
+
cursor: pointer;
|
| 67 |
+
display: flex;
|
| 68 |
+
justify-content: center;
|
| 69 |
+
align-items: center;
|
| 70 |
+
color: var(--color-grey-100);
|
| 71 |
+
border-radius: var(--radius-md);
|
| 72 |
+
font-size: var(--text-xxl);
|
| 73 |
+
font-weight: var(--weight-extrabold);
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.search-container .button-not-running {
|
| 77 |
+
background: linear-gradient(to right, var(--color-pink-500), var(--button-primary-background-fill));
|
| 78 |
+
}
|
| 79 |
+
.search-container .button-running {
|
| 80 |
+
background: var(--color-red-600);
|
| 81 |
+
}
|
| 82 |
+
.search-container .canceling {
|
| 83 |
+
background: var(--color-grey-500);
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.jobs-container {
|
| 87 |
+
display: grid;
|
| 88 |
+
grid-template-columns: repeat(auto-fill, minmax(var(--size-80), 1fr));
|
| 89 |
+
}
|
| 90 |
+
.card {
|
| 91 |
+
justify-self: center;
|
| 92 |
+
cursor: pointer;
|
| 93 |
+
border: none var(--spacing-md) var(--block-title-border-color);
|
| 94 |
+
border-radius: var(--radius-lg);
|
| 95 |
+
width: var(--size-80);
|
| 96 |
+
padding: var(--scale-2);
|
| 97 |
+
font: var(--font);
|
| 98 |
+
background: var(--block-label-background-fill);
|
| 99 |
+
}
|
| 100 |
+
.card:active {
|
| 101 |
+
transform: scale(0.97);
|
| 102 |
+
}
|
| 103 |
+
.card .title {
|
| 104 |
+
font-weight: var(--weight-bold);
|
| 105 |
+
font-size: var(--text-lg);
|
| 106 |
+
color: var(--block-title-text-color);
|
| 107 |
+
margin-bottom: var(--scale-0);
|
| 108 |
+
}
|
| 109 |
+
.card .title .running{
|
| 110 |
+
color: var(--color-red-600);
|
| 111 |
+
font-size: var(--text-md);
|
| 112 |
+
}
|
| 113 |
+
.card .title .stopped{
|
| 114 |
+
color: var(--color-grey-500);
|
| 115 |
+
font-size: var(--text-md);
|
| 116 |
+
}
|
| 117 |
+
.card .title .completed{
|
| 118 |
+
color: var(--color-green-700);
|
| 119 |
+
font-size: var(--text-md);
|
| 120 |
+
}
|
| 121 |
+
.card .info {
|
| 122 |
+
color: var(--block-info-text-color);
|
| 123 |
+
margin-bottom: var(--block-label-margin);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.details-container {
|
| 127 |
+
display: flex;
|
| 128 |
+
border-radius: 8px;
|
| 129 |
+
}
|
| 130 |
+
.details-container .title {
|
| 131 |
+
font-weight: var(--weight-bold);
|
| 132 |
+
font-size: var(--text-xl);
|
| 133 |
+
padding-left: var(--scale-0);
|
| 134 |
+
margin: var(--block-label-margin) 0 var(--block-label-margin) 0;
|
| 135 |
+
}
|
| 136 |
+
.details-container .info {
|
| 137 |
+
color: var(--block-info-text-color);
|
| 138 |
+
padding-left: var(--scale-0);
|
| 139 |
+
margin: var(--block-label-margin) 0 var(--block-label-margin) 0;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
.details-container .items {
|
| 143 |
+
margin-top: var(--scale-4);
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
.details-container .items .item {
|
| 147 |
+
padding: var(--scale-0);
|
| 148 |
+
margin-bottom: var(--scale-0);
|
| 149 |
+
border: solid var(--block-label-border-width) var(--block-label-border-color);
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
.details-container .items .item .top{
|
| 153 |
+
display: flex;
|
| 154 |
+
align-items: center;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
.details-container .items .item .top .favicon{
|
| 158 |
+
display: flex;
|
| 159 |
+
align-items: center;
|
| 160 |
+
margin-right: var(--scale-0);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
.details-container .items .item .top .favicon img{
|
| 164 |
+
height: 1.5em;
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
.details-container .items .item .top .title{
|
| 168 |
+
display: flex;
|
| 169 |
+
align-items: center;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
.details-container .items .item .bottom{
|
| 173 |
+
display: flex;
|
| 174 |
+
align-items: center;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.details-container .items .item .bottom .ner.mp,
|
| 178 |
+
.details-container .items .item .bottom .ner.adr {
|
| 179 |
+
padding: 0px 5px;
|
| 180 |
+
border-radius: 4px;
|
| 181 |
+
font-weight: var(--weight-bold);
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.details-container .items .item .bottom .ner.mp {
|
| 185 |
+
color: var(--color-grey-100);
|
| 186 |
+
background-color: var(--color-purple-700);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.details-container .items .item .bottom .ner.adr {
|
| 190 |
+
background-color: var(--color-pink);
|
| 191 |
+
}
|
vianu/spock/assets/head/scripts.html
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<script>
|
| 2 |
+
function cardClickHandler(card) {
|
| 3 |
+
var cards = document.getElementsByClassName("card");
|
| 4 |
+
for (let c of cards) {
|
| 5 |
+
c.style.borderStyle = "none";
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
card.style.borderStyle = "solid";
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
</script>
|
vianu/spock/assets/images/favicon.png
ADDED
|
|
vianu/spock/assets/images/spock_logo.png
ADDED
|
vianu/spock/assets/images/spock_logo_circular.png
ADDED
|
vianu/spock/launch_demo_app.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
from vianu import LOG_FMT
|
| 5 |
+
from vianu.spock.settings import LOG_LEVEL, GRADIO_SERVER_PORT
|
| 6 |
+
from vianu.spock.app import App
|
| 7 |
+
|
| 8 |
+
logging.basicConfig(level=LOG_LEVEL.upper(), format=LOG_FMT)
|
| 9 |
+
os.environ["GRADIO_SERVER_PORT"] = str(GRADIO_SERVER_PORT)
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
app = App()
|
| 13 |
+
demo = app.make()
|
| 14 |
+
demo.queue().launch(
|
| 15 |
+
favicon_path=app.favicon_path,
|
| 16 |
+
inbrowser=True,
|
| 17 |
+
allowed_paths=app.allowed_paths,
|
| 18 |
+
)
|
vianu/spock/launch_demo_pipeline.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from argparse import Namespace
|
| 2 |
+
import asyncio
|
| 3 |
+
|
| 4 |
+
from vianu.spock.__main__ import main
|
| 5 |
+
from vianu.spock.settings import SCRAPING_SOURCES, MAX_DOCS_SRC
|
| 6 |
+
from vianu.spock.settings import N_SCP_TASKS, N_NER_TASKS
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
_ARGS = {
|
| 10 |
+
'term': 'ibuprofen',
|
| 11 |
+
'max_docs_src': MAX_DOCS_SRC,
|
| 12 |
+
'source': SCRAPING_SOURCES,
|
| 13 |
+
'model': 'llama',
|
| 14 |
+
'n_scp_tasks': N_SCP_TASKS,
|
| 15 |
+
'n_ner_tasks': N_NER_TASKS,
|
| 16 |
+
'log_level': 'DEBUG',
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if __name__ == '__main__':
|
| 21 |
+
asyncio.run(main(args_=Namespace(**_ARGS), save=False))
|
vianu/spock/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiohttp==3.11.11
|
| 2 |
+
beautifulsoup4==4.12.3
|
| 3 |
+
dacite==1.8.1
|
| 4 |
+
gradio==5.10.0
|
| 5 |
+
numpy==2.2.1
|
| 6 |
+
pymupdf==1.25.1
|
| 7 |
+
python-dotenv==1.0.1
|
| 8 |
+
openai==1.59.5
|
| 9 |
+
defusedxml==0.7.1
|
vianu/spock/settings.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# General settings
|
| 2 |
+
LOG_LEVEL = 'DEBUG'
|
| 3 |
+
N_CHAR_DOC_ID = 12
|
| 4 |
+
FILE_PATH = "/tmp/spock/" # nosec
|
| 5 |
+
FILE_NAME = "spock"
|
| 6 |
+
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
| 7 |
+
|
| 8 |
+
# Gradio app settings
|
| 9 |
+
GRADIO_APP_NAME = "SpoCK"
|
| 10 |
+
GRADIO_SERVER_PORT=7868
|
| 11 |
+
GRADIO_MAX_JOBS = 5
|
| 12 |
+
GRADIO_UPDATE_INTERVAL = 2
|
| 13 |
+
|
| 14 |
+
# Scraping settings
|
| 15 |
+
SCRAPING_SOURCES = ['pubmed', 'ema', 'mhra', 'fda']
|
| 16 |
+
MAX_CHUNK_SIZE = 500
|
| 17 |
+
MAX_DOCS_SRC = 5
|
| 18 |
+
N_SCP_TASKS = 2
|
| 19 |
+
|
| 20 |
+
PUBMED_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
|
| 21 |
+
PUBMED_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
| 22 |
+
PUBMED_DB = 'pubmed'
|
| 23 |
+
PUBMED_BATCH_SIZE = 20
|
| 24 |
+
|
| 25 |
+
# NER settings
|
| 26 |
+
LARGE_LANGUAGE_MODELS = ['llama', 'openai']
|
| 27 |
+
MAX_TOKENS = 128.000
|
| 28 |
+
LLAMA_MODEL='llama3.2'
|
| 29 |
+
OPENAI_MODEL='gpt-4o'
|
| 30 |
+
N_NER_TASKS = 2
|
| 31 |
+
OLLAMA_BASE_URL_ENV_NAME = "OLLAMA_BASE_URL"
|
| 32 |
+
OPENAI_API_KEY_ENV_NAME = "OPENAI_API_KEY"
|
vianu/spock/src/__init__.py
ADDED
|
File without changes
|
vianu/spock/src/base.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from argparse import Namespace
|
| 3 |
+
from dataclasses import dataclass, asdict, field
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
from hashlib import sha256
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import dacite
|
| 12 |
+
import numpy as np
|
| 13 |
+
from typing import List, Self
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
@dataclass
|
| 19 |
+
class Serializable:
|
| 20 |
+
"""Abstract base class for all dataclasses that can be serialized to a dictionary."""
|
| 21 |
+
def to_dict(self) -> dict:
|
| 22 |
+
"""Converts the object to a dictionary."""
|
| 23 |
+
return asdict(self)
|
| 24 |
+
|
| 25 |
+
@classmethod
|
| 26 |
+
def from_dict(cls, dict_: dict) -> Self:
|
| 27 |
+
"""Creates an object from a dictionary."""
|
| 28 |
+
return dacite.from_dict(
|
| 29 |
+
data_class=cls,
|
| 30 |
+
data=dict_,
|
| 31 |
+
config=dacite.Config(type_hooks={datetime: datetime.fromisoformat})
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
|
| 36 |
+
class Identicator(ABC):
|
| 37 |
+
"""Abstract base class for entities with customized id.
|
| 38 |
+
|
| 39 |
+
Notes
|
| 40 |
+
The identifier :param:`Identicator.id_` is hashed and enriched with `_id_prefix` if this is
|
| 41 |
+
not present. This means as long as the `id_` begins with `_id_prefix` nothing is done.
|
| 42 |
+
|
| 43 |
+
This behavior aims to allow:
|
| 44 |
+
SubIdenticator(id_='This is the string that identifies the entity')
|
| 45 |
+
|
| 46 |
+
and with _id_prefix='sub' it produces an id_ of the form:
|
| 47 |
+
id_ = 'sub_5d41402abc4b2a76b9719d911017c592'
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
id_: str
|
| 51 |
+
|
| 52 |
+
def __eq__(self, other: object) -> bool:
|
| 53 |
+
if not isinstance(other, Identicator):
|
| 54 |
+
return NotImplemented
|
| 55 |
+
return self.id_ == other.id_
|
| 56 |
+
|
| 57 |
+
def __post_init__(self):
|
| 58 |
+
if not self.id_.startswith(self._id_prefix):
|
| 59 |
+
self.id_ = self._id_prefix + self._hash_id_str()
|
| 60 |
+
|
| 61 |
+
def _hash_id_str(self):
|
| 62 |
+
return sha256(self.id_.encode()).hexdigest()
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
@abstractmethod
|
| 66 |
+
def _id_prefix(self):
|
| 67 |
+
pass
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@dataclass(eq=False)
class NamedEntity(Identicator, Serializable):
    """A single named entity extracted from a document.

    Equality is inherited from :class:`Identicator` (``eq=False`` keeps the
    dataclass machinery from overriding it).
    """

    # surface form of the entity as it appears in the text
    text: str = field(default_factory=str)
    # entity type label (e.g. 'MP' or 'ADR')
    class_: str = field(default_factory=str)
    # [start, stop] character offsets within the host text, if resolved
    location: List[int] | None = None

    @property
    def _id_prefix(self):
        """Prefix marking hashed named-entity ids."""
        return 'ent_'
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@dataclass(eq=False)
class Document(Identicator, Serializable):
    """Class containing any document related information.

    Holds the raw document text plus metadata, the named entities found in it,
    and a cached HTML rendering (``_html``) keyed by a hash of the entity ids
    (``_html_hash``) so the rendering is recomputed only when entities change.
    """

    # mandatory document fields
    text: str
    source: str

    # additional document fields
    title: str | None = None
    url: str | None = None
    source_url: str | None = None
    source_favicon_url: str | None = None
    language: str | None = None
    publication_date: datetime | None = None

    # named entities
    medicinal_products: List[NamedEntity] = field(default_factory=list)
    adverse_reactions: List[NamedEntity] = field(default_factory=list)

    # protected fields (cached HTML rendering and its fingerprint)
    _html: str | None = None
    _html_hash: str | None = None

    @property
    def _id_prefix(self):
        # prefix marking hashed document ids
        return 'doc_'

    def remove_named_entity_by_id(self, id_: str) -> None:
        """Removes a named entity from the document by a given `doc.id_`."""
        self.medicinal_products = [ne for ne in self.medicinal_products if ne.id_ != id_]
        self.adverse_reactions = [ne for ne in self.adverse_reactions if ne.id_ != id_]

    def _get_html_hash(self) -> str:
        """Creates a sha256 hash from the named entities' ids. If the sets of named entities have been modified, this
        function will return a different hash.
        """
        ne_ids = [ne.id_ for ne in self.medicinal_products + self.adverse_reactions]
        html_hash_str = ' '.join(ne_ids)
        return sha256(html_hash_str.encode()).hexdigest()


    def _get_html(self) -> str:
        """Creates the HTML representation of the document with highlighted named entities."""
        text = f"<div>{self.text}</div>"

        # Highlight medicinal products according to the css class 'mp'
        # NOTE(review): str.replace substitutes *all* occurrences of ne.text,
        # so repeated or overlapping entity strings may be wrapped more than
        # once — confirm this is acceptable for the rendered output.
        mp_template = "<span class='ner mp'>{text} | {class_}</span>"
        for ne in self.medicinal_products:
            text = text.replace(
                ne.text, mp_template.format(text=ne.text, class_=ne.class_)
            )

        # Highlight adverse drug reactions according to the css class 'adr'
        adr_template = "<span class='ner adr'>{text} | {class_}</span>"
        for ne in self.adverse_reactions:
            text = text.replace(
                ne.text, adr_template.format(text=ne.text, class_=ne.class_)
            )

        return text


    def get_html(self) -> str:
        """Returns the HTML representation of the document with highlighted named entities. This function checks if
        the set of named entities has been modified and updates the HTML representation if necessary."""
        html_hash = self._get_html_hash()
        if self._html is None or html_hash != self._html_hash:
            self._html = self._get_html()
            self._html_hash = html_hash
        return self._html
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@dataclass(eq=False)
class Setup(Identicator, Serializable):
    """Pipeline setup; mirrors the CLI arguments (see ``src/cli.py``)."""

    # generic options
    log_level: str
    max_docs_src: int

    # scraping options
    term: str
    source: List[str]
    n_scp_tasks: int

    # NER options
    model: str
    n_ner_tasks: int

    # optional fields
    submission: datetime | None = None
    file_path: str | None = None
    file_name: str | None = None

    def __post_init__(self) -> None:
        super().__post_init__()
        # default the submission timestamp to "now" when not provided
        if self.submission is None:
            self.submission = datetime.now()

    @property
    def _id_prefix(self) -> str:
        """Prefix marking hashed setup ids."""
        return 'stp_'

    def to_namespace(self) -> Namespace:
        """Converts the :class:`Setup` object to a :class:`argparse.Namespace` object."""
        return Namespace(**asdict(self))

    @classmethod
    def from_namespace(cls, args_: Namespace) -> Self:
        """Creates a :class:`Setup` object from a :class:`argparse.Namespace` object."""
        kwargs = vars(args_)
        return cls(id_=str(kwargs), **kwargs)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
@dataclass
class QueueItem:
    """A single unit of work passed through the :class:`asyncio.Queue` pipeline."""

    # correlation id of the work item
    id_: str
    # the document to be processed
    doc: Document
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
@dataclass
class SpoCK(Identicator, Serializable):
    """Main class for the SpoCK pipeline mainly containing the job definition and the resulting data."""

    # Generic fields
    status: str | None = None
    started_at: datetime | None = None
    finished_at: datetime | None = None

    # Pipeline fields
    setup: Setup | None = None
    data: List[Document] = field(default_factory=list)

    @property
    def _id_prefix(self) -> str:
        """Prefix marking hashed SpoCK job ids."""
        return 'spk_'

    def runtime(self) -> timedelta | None:
        """Elapsed time of the job, or None when it has not been started.

        While the job is still running (no ``finished_at``), the runtime is
        measured against the current wall clock.
        """
        if self.started_at is None:
            return None
        end = self.finished_at if self.finished_at is not None else datetime.now()
        return end - self.started_at
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
SpoCKList = List[SpoCK]
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
class JSONEncoder(json.JSONEncoder):
    """Custom JSON encoder handling :class:`datetime`, :class:`Document` and numpy scalars."""

    def default(self, obj):
        # datetimes are serialized as ISO-8601 strings
        if isinstance(obj, datetime):
            return obj.isoformat()
        # documents are serialized via their dataclass dictionary form
        if isinstance(obj, Document):
            return asdict(obj)
        # numpy scalars are not JSON serializable out of the box
        if isinstance(obj, np.float32):
            return str(obj)
        if isinstance(obj, np.int64):
            return int(obj)
        # Let the base class default method raise the TypeError
        return super().default(obj)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
class FileHandler:
    """Reads from and writes data to a JSON file under a given file path."""

    _suffix = '.json'

    def __init__(self, file_path: Path | str) -> None:
        """Args:
            file_path: directory under which the JSON files are read and
                written; created (including parents) if it does not exist.
        """
        self._file_path = Path(file_path)
        # exist_ok avoids a race between the exists() check and the creation
        os.makedirs(self._file_path, exist_ok=True)

    def read(self, filename: str) -> SpoCK:
        """Reads the data from a JSON file and casts it into a :class:`SpoCK` object.

        Args:
            filename: file name (with or without the ``.json`` suffix),
                relative to the handler's base path.
        """
        # resolve relative to the base path; with_suffix is applied exactly once
        filename = (self._file_path / filename).with_suffix(self._suffix)

        # log the actual file being read (was a '(unknown)' placeholder)
        logger.info(f'reading data from file {filename}')
        with open(filename, 'r', encoding="utf-8") as dfile:
            dict_ = json.load(dfile)

        return SpoCK.from_dict(dict_=dict_)

    def write(self, file_name: str, spock: SpoCK, add_dt: bool = True) -> None:
        """Writes the data to a JSON file.

        If `add_dt=True`, the filename is `{file_name}_%Y%m%d%H%M%S.json`.
        """
        if add_dt:
            file_name = f'{file_name}_{datetime.now().strftime("%Y%m%d%H%M%S")}'
        file_name = (self._file_path / file_name).with_suffix(self._suffix)

        logger.info(f'writing data to file {file_name}')
        with open(file_name, 'w', encoding="utf-8") as dfile:
            json.dump(spock.to_dict(), dfile, cls=JSONEncoder)
|
vianu/spock/src/cli.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI for SpoCK
|
| 2 |
+
"""
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
from typing import Sequence
|
| 6 |
+
|
| 7 |
+
from vianu.spock.settings import LOG_LEVEL, FILE_NAME, FILE_PATH, MAX_DOCS_SRC
|
| 8 |
+
from vianu.spock.settings import SCRAPING_SOURCES, N_SCP_TASKS
|
| 9 |
+
from vianu.spock.settings import N_NER_TASKS, LARGE_LANGUAGE_MODELS
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_args(args_: Sequence) -> argparse.Namespace:
    """Parse the SpoCK command line arguments.

    Args:
        args_: raw argument sequence (e.g. ``sys.argv[1:]``)

    Returns:
        An :class:`argparse.Namespace` with the generic, scraping and NER
        options; defaults come from :mod:`vianu.spock.settings`.
    """
    parser = argparse.ArgumentParser(description="SpoCK", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Add generic options
    gen_gp = parser.add_argument_group('generic')
    gen_gp.add_argument("--log-level", metavar='', type=str, default=LOG_LEVEL, help='log level')
    gen_gp.add_argument("--file-path", metavar='', type=str, default=FILE_PATH, help='path for storing results')
    gen_gp.add_argument("--file-name", metavar='', type=str, default=FILE_NAME, help='filename for storing results')
    gen_gp.add_argument("--max-docs-src", metavar='', type=int, default=MAX_DOCS_SRC, help='maximum number of documents per source')

    # Add scraping group
    # NOTE: --source uses action='append', so it may be given multiple times
    scp_gp = parser.add_argument_group('scraping')
    scp_gp.add_argument('--source', '-s', type=str, action='append', choices=SCRAPING_SOURCES, help='data sources for scraping')
    scp_gp.add_argument('--term', '-t', metavar='', type=str, help='search term')
    scp_gp.add_argument('--n-scp-tasks', metavar='', type=int, default=N_SCP_TASKS, help='number of async scraping tasks')

    # Add NER group
    ner_gp = parser.add_argument_group('ner')
    ner_gp.add_argument('--model', '-m', type=str, choices=LARGE_LANGUAGE_MODELS, default='llama', help='NER model')
    ner_gp.add_argument('--n-ner-tasks', metavar='', type=int, default=N_NER_TASKS, help='number of async ner tasks')

    return parser.parse_args(args_)
|
vianu/spock/src/ner.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
import aiohttp
|
| 3 |
+
from argparse import Namespace
|
| 4 |
+
import asyncio
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from typing import List
|
| 9 |
+
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
from openai import AsyncOpenAI
|
| 12 |
+
|
| 13 |
+
from vianu.spock.src.base import NamedEntity, QueueItem # noqa: F401
|
| 14 |
+
from vianu.spock.settings import N_CHAR_DOC_ID, LLAMA_MODEL, OPENAI_MODEL
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
NAMED_ENTITY_PROMPT = """
|
| 20 |
+
You are an expert in Natural Language Processing. Your task is to identify named entities (NER) in a given text.
|
| 21 |
+
You will focus on the following entities: adverse drug reaction (entity type: ADR), medicinal product (entity type: MP).
|
| 22 |
+
Once you identified all named entities of the above types, you return them as a Python list of tuples of the form (text, type).
|
| 23 |
+
It is important to only provide the Python list as your output, without any additional explanations or text.
|
| 24 |
+
In addition, make sure that the named entity texts are exact copies of the original text segment
|
| 25 |
+
|
| 26 |
+
Example 1:
|
| 27 |
+
Input:
|
| 28 |
+
"The most commonly reported side effects of dafalgan include headache, nausea, and fatigue."
|
| 29 |
+
|
| 30 |
+
Output:
|
| 31 |
+
[("dafalgan", "MP"), ("headache", "ADR"), ("nausea", "ADR"), ("fatigue", "ADR")]
|
| 32 |
+
|
| 33 |
+
Example 2:
|
| 34 |
+
Input:
|
| 35 |
+
"Patients taking acetaminophen or naproxen have reported experiencing skin rash, dry mouth, and difficulty breathing after taking this medication. In rare cases, seizures have also been observed."
|
| 36 |
+
|
| 37 |
+
Output:
|
| 38 |
+
[("acetaminophen", "MP"), ("naproxen", "MP"), ("skin rash", "ADR"), ("dry mouth", "ADR"), ("difficulty breathing", "ADR"), ("seizures", "ADR")]
|
| 39 |
+
|
| 40 |
+
Example 3:
|
| 41 |
+
Input:
|
| 42 |
+
"There are reported side effects as dizziness, stomach upset, and in some instances, temporary memory loss. These are mainly observed after taking Amitiza (lubiprostone) or Trulance (plecanatide)."
|
| 43 |
+
|
| 44 |
+
Output:
|
| 45 |
+
[("dizziness", "ADR"), ("stomach upset", "ADR"), ("temporary memory loss", "ADR"), ("Amitiza (lubiprostone)", "MP"), ("Trulance (plecanatide)", "MP")]
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class NER(ABC):
    """Abstract base class for named-entity recognition over queued documents.

    Subclasses implement :meth:`_get_ner_model_answer`, which queries a
    concrete LLM backend and returns its raw text answer; the shared
    :meth:`apply` loop parses that answer into :class:`NamedEntity` objects
    and attaches them to the document.
    """

    # matches tuples like ("some text", "MP") / ("some text", "ADR") in the model answer
    _named_entity_pattern = re.compile(r'\("([^"]+)",\s?"(MP|ADR)"\)')

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

    @staticmethod
    def _get_loc_of_subtext(text: str, subtext: str) -> List[int] | None:
        """Get the [start, stop] location of the first occurrence of `subtext` in `text` (None if absent)."""
        pos = text.find(subtext)
        if pos == -1:
            return None
        return [pos, pos + len(subtext)]

    def _add_loc_for_named_entities(self, text: str, named_entities: List[NamedEntity]) -> None:
        """Resolve each entity's location in `text` (case-insensitively) and normalize its surface form.

        Entities that cannot be located keep ``location=None`` and are reported
        via a warning; callers typically filter those out afterwards.
        """
        txt_low = text.lower()
        for ne in named_entities:
            ne_txt_low = ne.text.lower()
            loc = self._get_loc_of_subtext(text=txt_low, subtext=ne_txt_low)

            if loc is not None:
                ne.location = loc
                # use the original casing as found in the document
                ne.text = text[loc[0]:loc[1]]
            else:
                self.logger.warning(f'could not find location for named entity "{ne.text}" of class "{ne.class_}"')

    @staticmethod
    def _get_messages(text: str) -> List[dict]:
        """Build the chat messages (system prompt + user text) sent to the model."""
        text = f'Process the following input text: "{text}"'
        return [
            {
                "role": "system",
                "content": NAMED_ENTITY_PROMPT,
            },
            {
                "role": "user",
                "content": text,
            },
        ]

    @abstractmethod
    async def _get_ner_model_answer(self, text: str) -> str:
        """Return the raw model answer for `text`; implemented by backend subclasses."""
        # Fixed: the original message wrongly referred to OpenAINER in this base class.
        raise NotImplementedError('NER._get_ner_model_answer must be implemented by a subclass')

    async def apply(self, queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> None:
        """Apply NER to a text received from input queue and put the results in an output queue.

        The loop terminates when a ``None`` sentinel is read from `queue_in`.
        On model errors the item is dropped, i.e. it is not forwarded to
        `queue_out`.
        """
        while True:
            # Get text from input queue
            item = await queue_in.get()  # type: QueueItem

            # Check stopping condition (None sentinel)
            if item is None:
                queue_in.task_done()
                break

            # Get the model response with named entities
            id_ = item.id_
            doc = item.doc
            self.logger.debug(f'starting ner for item.id_={id_} (doc.id_={doc.id_[:N_CHAR_DOC_ID]})')
            try:
                text = doc.text
                content = await self._get_ner_model_answer(text=text)
            except Exception as e:
                self.logger.error(f'error during ner for item.id_={item.id_} (doc.id_={doc.id_[:N_CHAR_DOC_ID]}): {e}')
                queue_in.task_done()
                continue

            # Parse the model answer and remove duplicates (order is not preserved)
            ne_list = re.findall(self._named_entity_pattern, content)
            ne_list = list(set(ne_list))

            # Create list of NamedEntity objects; the id_ string combines the
            # source text, the entity text and its class so ids are stable
            named_entities = []
            for ne in ne_list:
                try:
                    txt, cls_ = ne
                    named_entities.append(NamedEntity(id_=f'{text} {txt} {cls_}', text=txt, class_=cls_))
                except Exception as e:
                    self.logger.error(f'error during creation of `NamedEntity` using {ne}: {e}')

            # Add locations to named entities and remove those without location
            self._add_loc_for_named_entities(text=text, named_entities=named_entities)
            named_entities = [ne for ne in named_entities if ne.location is not None]

            # Assign named entities to the document
            ne_mp = [ne for ne in named_entities if ne.class_ == 'MP']
            ne_adr = [ne for ne in named_entities if ne.class_ == 'ADR']
            self.logger.debug(f'found #mp={len(ne_mp)} and #adr={len(ne_adr)} for item.id_={id_} (doc.id_={doc.id_[:N_CHAR_DOC_ID]})')
            doc.medicinal_products = ne_mp
            doc.adverse_reactions = ne_adr

            # Put the document in the output queue
            await queue_out.put(item)
            queue_in.task_done()
            self.logger.info(f'finished NER task for item.id_={id_} (doc.id_={doc.id_[:N_CHAR_DOC_ID]})')
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class OllamaNER(NER):
    """NER backend talking to an Ollama server via its HTTP chat API."""

    def __init__(self, base_url: str, model: str):
        """Args:
            base_url: base URL of the Ollama server (without `/api/...`)
            model: name of the chat model to use
        """
        super().__init__()
        self._base_url = base_url
        self._model = model

    def _get_http_data(self, text: str, stream: bool = False) -> dict:
        """Assemble the JSON payload for the `/api/chat/` endpoint."""
        return {
            "model": self._model,
            "messages": self._get_messages(text=text),
            "stream": stream,
        }

    async def _get_ner_model_answer(self, text: str) -> str:
        """POST the chat request and return the model's answer text."""
        payload = self._get_http_data(text=text)
        endpoint = f'{self._base_url}/api/chat/'
        async with aiohttp.ClientSession() as session:
            async with session.post(endpoint, json=payload) as response:
                response.raise_for_status()
                body = await response.json()
                return body['message']['content']
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class OpenAINER(NER):
    """NER backend using the asynchronous OpenAI chat completion API."""

    def __init__(self, api_key: str, model: str):
        """Args:
            api_key: OpenAI API key
            model: chat model identifier
        """
        super().__init__()
        self._model = model
        self._client = AsyncOpenAI(api_key=api_key)

    async def _get_ner_model_answer(self, text: str) -> str:
        """Send the chat completion request and return the first choice's content."""
        completion = await self._client.chat.completions.create(
            messages=self._get_messages(text=text),
            model=self._model,
        )
        return completion.choices[0].message.content
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def create_tasks(args_: Namespace, queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> List[asyncio.Task]:
    """Create asyncio NER tasks.

    Args:
        args_: pipeline arguments; `args_.model` selects the backend
            ('llama' or 'openai') and `args_.n_ner_tasks` the task count.
        queue_in: queue delivering items to process
        queue_out: queue receiving processed items

    Raises:
        EnvironmentError: if the selected backend's environment variable is unset
        ValueError: if `args_.model` is unknown
    """
    n_ner_tasks = args_.n_ner_tasks
    model = args_.model

    if model == 'llama':
        base_url = os.environ.get('OLLAMA_BASE_URL')
        if base_url is None:
            # Fixed: the message previously named OLLAMA_ENDPOINT while the
            # code reads OLLAMA_BASE_URL.
            raise EnvironmentError("The ollama endpoint must be set by the OLLAMA_BASE_URL environment variable")
        ner = OllamaNER(base_url=base_url, model=LLAMA_MODEL)

    elif model == 'openai':
        api_key = os.environ.get('OPENAI_API_KEY')
        if api_key is None:
            raise EnvironmentError("The api_key for the OpenAI client must be set by the OPENAI_API_KEY environment variable")
        ner = OpenAINER(api_key=api_key, model=OPENAI_MODEL)

    else:
        raise ValueError(f"unknown ner model '{args_.model}'")

    logger.info(f'setting up {n_ner_tasks} NER task(s)')
    tasks = [asyncio.create_task(ner.apply(queue_in=queue_in, queue_out=queue_out)) for _ in range(n_ner_tasks)]
    return tasks
|
vianu/spock/src/scraping.py
ADDED
|
@@ -0,0 +1,922 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Module for scraping data from different sources.
|
| 2 |
+
|
| 3 |
+
The module contains three main classes:
|
| 4 |
+
- :class:`Scraper`: Abstract base class for scraping data from different sources
|
| 5 |
+
- :class:`PubmedScraper`: Class for scraping data from the PubMed database
|
| 6 |
+
- :class:`EMAScraper`: Class for scraping data from the European Medicines Agency
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from abc import ABC, abstractmethod
|
| 10 |
+
from argparse import Namespace
|
| 11 |
+
import asyncio
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
from io import BytesIO
|
| 15 |
+
import logging
|
| 16 |
+
import re
|
| 17 |
+
from typing import List
|
| 18 |
+
import xml.etree.ElementTree as ET # nosec
|
| 19 |
+
|
| 20 |
+
import aiohttp
|
| 21 |
+
from bs4 import BeautifulSoup
|
| 22 |
+
from bs4.element import Tag
|
| 23 |
+
import defusedxml.ElementTree as DET
|
| 24 |
+
import numpy as np
|
| 25 |
+
import pymupdf
|
| 26 |
+
|
| 27 |
+
from vianu.spock.src.base import Document, QueueItem # noqa: F401
|
| 28 |
+
from vianu.spock.settings import MAX_CHUNK_SIZE, SCRAPING_SOURCES
|
| 29 |
+
from vianu.spock.settings import PUBMED_ESEARCH_URL, PUBMED_DB, PUBMED_EFETCH_URL, PUBMED_BATCH_SIZE
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Scraper(ABC):
    """Abstract base class for scraping documents from a single source."""

    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    async def apply(self, args_: Namespace, queue_out: asyncio.Queue) -> None:
        """Main function for scraping data from a source.

        Args:
            - args_: the arguments for the spock pipeline
            - queue_out: the output queue for the scraped data
        """
        pass

    @staticmethod
    def split_text_into_chunks(text: str, chunk_size: int = MAX_CHUNK_SIZE, separator: str = ' ') -> List[str]:
        """Split a text into roughly equal chunks of at most `chunk_size` words.

        The text is split on `separator` and chunk boundaries are distributed
        evenly over the word list so that no chunk exceeds `chunk_size` words.
        """
        words = text.split(separator)
        N = len(words)
        s = min(chunk_size, N)
        # Ceiling division: with floor division (N // s) the last chunks could
        # exceed `chunk_size` (e.g. 5 words with chunk_size=2 gave a chunk of 3).
        n = -(-N // s)
        bnd = [round(i) for i in np.linspace(0, 1, n + 1) * N]

        chunks = [separator.join(words[start:stop]) for start, stop in zip(bnd[:-1], bnd[1:])]
        return chunks

    @staticmethod
    async def _aiohttp_get_html(url: str, headers=None) -> str:
        """Get the content of a given URL by an aiohttp GET request."""
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url=url) as response:
                response.raise_for_status()
                text = await response.text()
                return text
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@dataclass
class PubmedEntrezHistoryParams:
    """Entrez history-server parameters for batched Pubmed retrieval of large result sets.

    An example can be found here:
    https://www.ncbi.nlm.nih.gov/books/n/helpeutils/chapter3/#chapter3.Application_3_Retrieving_large
    """

    # WebEnv token identifying the stored search on the history server
    web: str
    # query key within that WebEnv
    key: str
    # total number of hits reported by esearch
    count: int
|
| 82 |
+
|
| 83 |
+
class PubmedScraper(Scraper):
    """Class for scraping data from the PubMed database.

    The scraper uses the Pubmed API to search for relevant documents. From the list of results it creates a list of
    :class:`Document` objects by the following main steps:
    - Extract all PubmedArticle elements from the search results (other types are ignored)
    - Extract the AbstractText from the PubmedArticle (if there is no abstract, the document is ignored)
    """

    _source = 'pubmed'
    _source_url = 'https://pubmed.ncbi.nlm.nih.gov/'
    _source_favicon_url = 'https://www.ncbi.nlm.nih.gov/favicon.ico'

    _robots_txt_url = 'https://www.ncbi.nlm.nih.gov/robots.txt'

    @staticmethod
    def _get_entrez_history_params(text: str) -> PubmedEntrezHistoryParams:
        """Retrieve the entrez history parameters for optimized search when requesting large numbers of documents.

        An example can be found here:
        https://www.ncbi.nlm.nih.gov/books/n/helpeutils/chapter3/#chapter3.Application_3_Retrieving_large

        NOTE(review): the re.search calls assume the ESearch response always contains WebEnv,
        QueryKey and Count tags; a malformed response would raise AttributeError here.
        """
        web = re.search(r'<WebEnv>(\S+)</WebEnv>', text).group(1)
        key = re.search(r'<QueryKey>(\d+)</QueryKey>', text).group(1)
        count = int(re.search(r'<Count>(\d+)</Count>', text).group(1))
        return PubmedEntrezHistoryParams(web=web, key=key, count=count)

    async def _pubmed_esearch(self, term: str) -> str:
        """Search the Pubmed database with a given term and POST the results to entrez history server."""
        url = f'{PUBMED_ESEARCH_URL}?db={PUBMED_DB}&term={term}&usehistory=y'
        self.logger.debug(f'search pubmed database with url={url}')
        esearch = await self._aiohttp_get_html(url=url)
        return esearch

    async def _pubmed_efetch(self, params: PubmedEntrezHistoryParams, max_docs_src: int) -> List[str]:
        """Retrieve the relevant documents from the entrez history server.

        Args:
        - params: entrez history server handle (WebEnv/QueryKey/Count) from a previous ESearch
        - max_docs_src: upper bound on the number of documents to retrieve

        Returns the raw XML payloads, one string per fetched batch.
        """
        # Reduce the number of documents to be retrieved for efficiency
        N = min(max_docs_src, int(params.count))
        if N < params.count:
            self.logger.warning(f'from the total number of documents={params.count} only {N} will be retrieved')

        # BUGFIX: with an empty search result (count == 0) the original code computed
        # batch_size == 0 and called range(0, 0, 0), which raises
        # "ValueError: range() arg 3 must not be zero". Return early instead.
        if N == 0:
            return []

        # Iterate over the batches of documents (with fixed batch size)
        batch_size = min(params.count, PUBMED_BATCH_SIZE)
        n_batches = (N + batch_size - 1) // batch_size  # ceil division (the old "N // batch_size + 1" over-counted when N was a multiple of batch_size)
        self.logger.debug(f'fetch #docs={N} in {n_batches} batch(es) of size <= {batch_size}')
        batches = []
        for retstart in range(0, N, batch_size):

            # Prepare URL for retrieving next batch of documents but stop if the maximum number is reached
            retmax = min(max_docs_src - len(batches)*batch_size, batch_size)
            url = f'{PUBMED_EFETCH_URL}?db={PUBMED_DB}&WebEnv={params.web}&query_key={params.key}&retstart={retstart}&retmax={retmax}'
            self.logger.debug(f'fetch documents with url={url}')

            # Fetch the documents
            efetch = await self._aiohttp_get_html(url=url)
            batches.append(efetch)
        return batches

    def _extract_medline_citation(self, element: ET.Element) -> ET.Element | None:
        """Extract the MedlineCitation element from a PubmedArticle element, or None if absent."""
        # Find and extract the MedlineCitation element
        citation = element.find('MedlineCitation')
        if citation is None:
            self.logger.warning('no "MedlineCitation" element found')
            return None
        return citation

    @staticmethod
    def _extract_pmid(element: ET.Element) -> str | None:
        """Extract the PMID from a MedlineCitation element."""
        pmid = element.find('PMID')
        return pmid.text if pmid is not None else None

    @staticmethod
    def _extract_article(element: ET.Element) -> ET.Element | None:
        """Extract the Article element from a MedlineCitation element (None if absent)."""
        article = element.find('Article')
        return article

    @staticmethod
    def _extract_title(article: ET.Element) -> str | None:
        """Extract the title from an Article element."""
        title = article.find('ArticleTitle')
        return title.text if title is not None else None

    @staticmethod
    def _extract_abstract(article: ET.Element) -> str | None:
        """Extract the abstract from an Article element.

        Multiple AbstractText sections are joined by blank lines; returns None when the
        article carries no Abstract element at all.
        """
        separator = '\n\n'
        abstract = article.find('Abstract')
        if abstract is not None:
            abstract = separator.join([a.text for a in abstract.findall('AbstractText') if a.text is not None])
        return abstract

    @staticmethod
    def _extract_language(article: ET.Element) -> str | None:
        """Extract the language from an Article element."""
        language = article.find('Language')
        return language.text if language is not None else None

    @staticmethod
    def _extract_date(article: ET.Element) -> datetime | None:
        """Extract the publication date from an Article element.

        NOTE(review): assumes an existing ArticleDate always has Year/Month/Day children;
        a partial date would raise AttributeError here.
        """
        date = article.find('ArticleDate')
        if date is None:
            return None
        year = int(date.find('Year').text)
        month = int(date.find('Month').text)
        day = int(date.find('Day').text)
        return datetime(year=year, month=month, day=day)

    @staticmethod
    def _extract_publication_types(article: ET.Element) -> List[str]:
        """Extract the publication types from an Article element."""
        return [t.text for t in article.find('PublicationTypeList').findall('PublicationType')]

    def _parse_pubmed_articles(self, batches: List[str]) -> List[Document]:
        """Parse batches of ET.Elements into a single list of Document objects."""
        data = []
        for ib, text in enumerate(batches):
            pubmed_articles = DET.fromstring(text).findall('PubmedArticle')
            self.logger.debug(f'found #articles={len(pubmed_articles)} in batch {ib}')
            for ie, element in enumerate(pubmed_articles):
                self.logger.debug(f'parsing PubmedArticle {ie} of batch {ib}')
                # Extract MedlineCitation and its PMID from PubmedArticle
                citation = self._extract_medline_citation(element=element)
                if citation is None:
                    self.logger.debug(f'no citation found in PubmedArticle {ie} of batch {ib}')
                    continue
                pmid = self._extract_pmid(element=citation)

                # Extract the Article element from the PubmedArticle
                article = self._extract_article(element=citation)
                if article is None:
                    self.logger.debug(f'no article found in PubmedArticle {ie} of batch {ib}')
                    continue

                # Extract the relevant information from the Article element.
                # (Renamed from 'text' to 'abstract' — the original rebound the outer
                # loop variable holding the raw batch payload.)
                title = self._extract_title(article=article)
                abstract = self._extract_abstract(article=article)
                if abstract is None:
                    self.logger.debug(f'no abstract found in PubmedArticle {ie} of batch {ib}')
                    continue
                language = self._extract_language(article=article)
                publication_date = self._extract_date(article=article)

                # Split long texts into chunks
                texts = self.split_text_into_chunks(text=abstract)

                # Create the Document object(s)
                for txt in texts:
                    document = Document(
                        id_=f'{self._source_url} {title} {txt} {language} {publication_date}',
                        text=txt,
                        source=self._source,
                        title=title,
                        url=f'{self._source_url}{pmid}/',
                        source_url=self._source_url,
                        source_favicon_url=self._source_favicon_url,
                        language=language,
                        publication_date=publication_date,
                    )
                    data.append(document)
        self.logger.debug(f'parsed #docs={len(data)} from #batches={len(batches)}')
        return data

    async def apply(self, args_: Namespace, queue_out: asyncio.Queue) -> None:
        """Query and retrieve all PubmedArticle Documents for the given search term.

        The retrieval is using two main functionalities of the Pubmed API:
        - ESearch: Identify the relevant documents and store them in the entrez history server
        - EFetch: Retrieve the relevant documents from the entrez history server

        Args:
        - args_: the arguments for the spock pipeline
        - queue_out: the output queue for the scraped data
        """
        term = args_.term
        max_docs_src = args_.max_docs_src
        self.logger.debug(f'starting scraping the source={self._source} with term={term}')

        # Search for relevant documents with a given term
        esearch = await self._pubmed_esearch(term=term)

        # Retrieve relevant documents in batches
        params = self._get_entrez_history_params(esearch)
        batches = await self._pubmed_efetch(params=params, max_docs_src=max_docs_src)

        # Parse documents from batches (chunking may have produced more than requested)
        documents = self._parse_pubmed_articles(batches=batches)
        documents = documents[:max_docs_src]

        # Add documents to the queue
        for i, doc in enumerate(documents):
            id_ = f'{self._source}_{i}'
            item = QueueItem(id_=id_, doc=doc)
            await queue_out.put(item)

        self.logger.info(f'retrieved #docs={len(documents)} in source={self._source} for term={term}')
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
@dataclass
class SearchResults:
    """Container for the search results retrieved from the different databases."""
    count: int | None    # total number of results reported by the source (None if it could not be read)
    n_pages: int | None  # number of result pages (None when unknown / not paginated)
    items: List[Tag]     # raw result tags, one per hit
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class EMAScraper(Scraper):
    """Class for scraping data from the European Medicines Agency.

    The scraper uses the same API as the web interface of the EMA to search for relevant documents. From the list of
    results it creates a list of :class:`Document` objects by the following main steps:
    - Search the EMA database (filter for PDF documents only)
    - Extract the text from the PDF documents
    - Use regex to find texts where adverse drug reactions (or similar) are mentioned
    - Return the most recent :class:`Document` objects
    """

    _source = 'ema'
    _source_url = 'https://www.ema.europa.eu'
    _source_favicon_url = 'https://www.ema.europa.eu/themes/custom/ema_theme/favicon.ico'

    _pdf_search_template = (
        "https://www.ema.europa.eu/en/search?search_api_fulltext={term}"
        "&f%5B0%5D=ema_search_custom_entity_bundle%3Adocument"  # This part is added to only retrieve PDF documents
        "&f%5B1%5D=ema_search_entity_is_document%3ADocument"
    )
    _headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Host': 'www.ema.europa.eu',
    }

    _robots_txt_url = 'https://www.ema.europa.eu/robots.txt'

    def _extract_search_results_count(self, soup: BeautifulSoup) -> int | None:
        """Extract the number of search results."""
        span = soup.find("span", class_="source-summary-count")
        if span is None:
            self.logger.warning('no search results count found')
            return None
        # The count is rendered as "(N)"
        return int(span.text.strip('()'))

    def _extract_number_of_pages(self, soup: BeautifulSoup) -> int | None:
        """Extract the number of pages from the search results (pager is 0-indexed, hence +1)."""
        nav = soup.find('nav', class_='pager')
        if nav is not None:
            a = nav.find('a', {'class': 'page-link', 'aria-label': 'Last'})
            if a and a.has_attr('href'):
                href = a['href']
                match = re.search(r'&page=(\d+)', href)
                if match:
                    return int(match.group(1)) + 1
        self.logger.warning('no pager found')
        return None

    @staticmethod
    def _extract_search_item_divs(soup: BeautifulSoup) -> List[Tag]:
        """Extract the list of div elements containing the different search results."""
        parent = soup.find('div', class_=['row', 'row-cols-1'])
        return parent.find_all('div', class_='col')

    async def _ema_document_search(self, term: str, max_docs_src: int) -> SearchResults:
        """Search the EMA database for PDF documents with a given term.

        Args:
        - term: the search term
        - max_docs_src: stop paging once this many result items have been collected
        """
        # Get initial search results
        base_url = self._pdf_search_template.format(term=term)
        self.logger.debug(f'search ema database with url={base_url}')
        content = await self._aiohttp_get_html(url=base_url, headers=self._headers)
        soup = BeautifulSoup(content, 'html.parser')

        # Get the number of search results and number of pages
        count = self._extract_search_results_count(soup=soup)
        n_pages = self._extract_number_of_pages(soup=soup)

        # Extract the divs containing the search results
        items = []
        if count is not None and count > 0:
            # Extract items from page=0
            items_from_page = self._extract_search_item_divs(soup=soup)
            items.extend(items_from_page)

            # Extract items from page=1, 2, ...
            if n_pages is not None and n_pages > 1:
                for i in range(1, n_pages):
                    # BUGFIX: build each page URL from the unmodified base URL. The original
                    # appended '&page={i}' to the previously modified URL, so page parameters
                    # accumulated (…&page=1&page=2&page=3…).
                    page_url = f'{base_url}&page={i}'
                    content = await self._aiohttp_get_html(url=page_url, headers=self._headers)
                    soup = BeautifulSoup(content, 'html.parser')

                    items_from_page = self._extract_search_item_divs(soup=soup)
                    items.extend(items_from_page)

                    if len(items) >= max_docs_src:
                        self.logger.debug(f'found #items={len(items)} in #pages={i+1}')
                        break

        # Check for extraction mismatch (expected when paging stopped early at max_docs_src)
        if len(items) != count:
            self.logger.warning(f'mismatch #items={len(items)} and the total count={count}')

        self.logger.debug(f'extracted #items={len(items)} in #pages={n_pages}')
        return SearchResults(count=count, n_pages=n_pages, items=items)

    @staticmethod
    def _extract_title(tag: Tag) -> str | None:
        """Extract the title of the document."""
        title = tag.find('p', class_='file-title')
        return title.text if title is not None else None

    def _extract_url(self, tag: Tag) -> str | None:
        """Extract the link's href to the relevant PDF document (None for non-PDF links)."""
        link = tag.find('a', href=True)
        if link is None:
            self.logger.warning('no link found')
            return None

        href = link['href']
        url = f'{self._source_url}{href}' if href.startswith('/') else href
        if not url.endswith('.pdf'):
            self.logger.warning(f'url={url} does not point to a PDF document')
            return None
        return url

    async def _extract_text(self, url: str) -> str:
        """Download a PDF document and extract the text of all its pages."""
        async with aiohttp.ClientSession(headers=self._headers) as session:
            async with session.get(url) as response:
                response.raise_for_status()
                content = await response.read()  # Read the entire content

        # Create a BytesIO object from the content
        pdf_stream = BytesIO(content)

        # BUGFIX: open the PDF inside a context manager so the document is closed even
        # when text extraction raises (the original called doc.close() unconditionally
        # after extraction and leaked the handle on error).
        with pymupdf.open(stream=pdf_stream, filetype="pdf") as doc:
            # Extract text from all pages
            text = '\n'.join([page.get_text() for page in doc])
        return text

    @staticmethod
    def _extract_language(tag: Tag) -> str | None:
        """Extract the language of the document (the part in parentheses, lower-cased)."""
        lang_tag = tag.find('p', class_='language-meta')
        if lang_tag is None:
            return None
        text = lang_tag.text
        start = text.find('(')
        stop = text.find(')')
        if start != -1 and stop != -1:
            return text[start+1:stop].lower()
        return None

    @staticmethod
    def _extract_date(tag: Tag) -> datetime | None:
        """Extract the publication date of the document."""
        time_tag = tag.find('time')
        if time_tag and time_tag.has_attr('datetime'):
            return datetime.fromisoformat(time_tag['datetime'])
        return None

    async def _parse_items(self, items: List[Tag]) -> List[Document]:
        """From a list of divs containing the search results, extract the relevant information and parse it into a list
        of :class:`Document` objects.
        """
        data = []
        for i, tag in enumerate(items):
            url = self._extract_url(tag=tag)
            if url is None:
                self.logger.debug(f'no url found for item {i}')
                continue

            # Extract the relevant information from the document
            self.logger.debug(f'parsing document with url={url}')
            title = self._extract_title(tag=tag)
            text = self._extract_text(url=url)
            language = self._extract_language(tag=tag)
            publication_date = self._extract_date(tag=tag)

            # Split long texts into chunks
            texts = self.split_text_into_chunks(text=await text if False else text)

            # Create the Document object(s)
            for chunk in texts:
                document = Document(
                    id_=f'{self._source_url} {title} {chunk} {language} {publication_date}',
                    text=chunk,
                    source=self._source,
                    title=title,
                    url=url,
                    source_url=self._source_url,
                    source_favicon_url=self._source_favicon_url,
                    language=language,
                    publication_date=publication_date,
                )
                data.append(document)
        self.logger.debug(f'created #docs={len(data)}')
        return data

    async def apply(self, args_: Namespace, queue_out: asyncio.Queue) -> None:
        """Query and retrieve all PRAC documents for the given search term.

        Args:
        - args_: the arguments for the spock pipeline
        - queue_out: the output queue for the scraped data
        """
        term = args_.term
        max_docs_src = args_.max_docs_src
        self.logger.debug(f'starting scraping the source={self._source} with term={term}')

        # Search for relevant documents with a given term
        search_results = await self._ema_document_search(term=term, max_docs_src=max_docs_src)
        n_items = len(search_results.items)
        if n_items > max_docs_src:
            self.logger.warning(f'from #items={n_items} only max_docs_src={max_docs_src} will be parsed')
        items = search_results.items[:max_docs_src]

        # Parse the documents
        data = await self._parse_items(items=items)
        n_data = len(data)
        if n_data > max_docs_src:
            self.logger.warning(f'the #items={n_items} were chunked into #documents={n_data} from where only max_docs_src={max_docs_src} will be added to the queue')
        data = data[:max_docs_src]

        # Add documents to the queue
        for i, doc in enumerate(data):
            id_ = f'{self._source}_{i}'
            item = QueueItem(id_=id_, doc=doc)
            await queue_out.put(item)

        self.logger.info(f'retrieved #docs={len(data)} in source={self._source} for term={term}')
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
class MHRAScraper(Scraper):
    """Class for scraping data from the Medicines and Healthcare products Regulatory Agency.

    The scraper uses the MHRA's **Drug Safety Update** search API for retrieving relevant documents. From the list of
    results it creates a list of :class:`Document` objects.
    """

    _source = 'mhra'
    # BUGFIX: was 'https://www.gov.uk/durg-safety-update' ('durg' typo) — the search
    # template below confirms the correct path segment is 'drug-safety-update'.
    _source_url = 'https://www.gov.uk/drug-safety-update'
    _source_favicon_url = 'https://www.gov.uk/favicon.ico'

    _search_template = 'https://www.gov.uk/drug-safety-update?keywords={term}'
    _source_base_url = 'https://www.gov.uk'
    _language = 'en'  # all Drug Safety Update content is English

    def _extract_search_results_count(self, parent: Tag) -> int | None:
        """Extract the number of search results from the results header."""
        div = parent.find('div', class_='result-info__header')
        h2 = div.find('h2') if div else None

        if h2 is None:
            self.logger.warning('no search results count found')
            return None
        text = h2.get_text(strip=True)
        # BUGFIX: guard the regex — the original called .group() on a possible None
        # when the header contained no digits.
        match = re.search(r'\d+', text)
        if match is None:
            self.logger.warning('no search results count found')
            return None
        return int(match.group())

    @staticmethod
    def _extract_search_item_divs(parent: Tag) -> List[Tag]:
        """Extract the divs containing the search results."""
        return parent.find_all('li', class_='gem-c-document-list__item')

    async def _mhra_document_search(self, term: str) -> SearchResults:
        """Search the MHRA database for documents with a given term."""

        # Get search results and extract divs containing the search results
        url = self._search_template.format(term=term)
        self.logger.debug(f"search mhra's drug safety update database with url={url}")
        content = await self._aiohttp_get_html(url=url)
        soup = BeautifulSoup(content, 'html.parser')
        parent = soup.find('div', class_=['govuk-grid-column-two-thirds', 'js-live-search-results-block', 'filtered-results'])

        # Robustness: without the results container nothing can be extracted (the
        # original passed None on to _extract_search_results_count and crashed).
        if parent is None:
            self.logger.warning('no search results container found')
            return SearchResults(count=None, n_pages=None, items=[])

        # Extract the number of search results
        count = self._extract_search_results_count(parent=parent)

        # Extract the divs containing the search results
        items = []
        if count is not None and count > 0:
            items = self._extract_search_item_divs(parent=parent)  # For a given search term, the site shows all the results without pagination.

        # Check for extraction mismatch
        if len(items) != count:
            self.logger.warning(f'mismatch #items={len(items)} and the total count={count}')

        self.logger.debug(f'found #items={len(items)}')
        # BUGFIX: SearchResults has a required n_pages field; the original call omitted
        # it and raised TypeError. The MHRA results are not paginated, hence None.
        return SearchResults(count=count, n_pages=None, items=items)

    def _extract_url(self, link: Tag) -> str | None:
        """Extract the url to the document (relative hrefs are resolved against gov.uk)."""
        href = link['href']
        return f'{self._source_base_url}{href}' if href.startswith('/') else href

    async def _extract_text(self, url: str) -> str:
        """Extract the text from the document.

        NOTE(review): assumes the page always has a <main> element — verify against the
        gov.uk markup before hardening.
        """
        content = await self._aiohttp_get_html(url=url)
        soup = BeautifulSoup(content, 'html.parser')
        main = soup.find('main')
        text = main.get_text()

        # Clean the spaces and newlines
        text = re.sub(r'(\n\s*){3,}', '\n\n', text)
        text = re.sub(r'^ +', '', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _extract_date(tag: Tag) -> datetime | None:
        """Extract the publication date of the document."""
        time_tag = tag.find('time')
        if time_tag and time_tag.has_attr('datetime'):
            return datetime.fromisoformat(time_tag['datetime'])
        return None

    async def _parse_items(self, items: List[Tag]) -> List[Document]:
        """From a list of divs containing the search results (items), extract the relevant information and parse it into a list
        of :class:`Document` objects.
        """
        data = []
        for i, tag in enumerate(items):
            link = tag.find('a', href=True)
            if link is None:
                self.logger.warning(f'no link found for item {i}')
                continue
            url = self._extract_url(link=link)

            # Extract the relevant information from the document
            self.logger.debug(f'parsing item with url={url}')
            title = link.get_text(strip=True)
            text = await self._extract_text(url=url)
            publication_date = self._extract_date(tag=tag)

            # Split long texts into chunks
            texts = self.split_text_into_chunks(text=text)

            # Create the Document object(s)
            for chunk in texts:
                document = Document(
                    id_=f'{self._source_url} {title} {chunk} {publication_date}',
                    text=chunk,
                    source=self._source,
                    title=title,
                    url=url,
                    source_url=self._source_url,
                    source_favicon_url=self._source_favicon_url,
                    language=self._language,
                    publication_date=publication_date,
                )
                data.append(document)
        self.logger.debug(f'created #docs={len(data)}')
        return data

    async def apply(self, args_: Namespace, queue_out: asyncio.Queue) -> None:
        """Query and retrieve all drug safety updates for the given search term.

        Args:
        - args_: the arguments for the spock pipeline
        - queue_out: the output queue for the scraped data
        """
        term = args_.term
        max_docs_src = args_.max_docs_src
        self.logger.debug(f'starting scraping the source={self._source} with term={term}')

        # Search for relevant documents with a given term
        search_results = await self._mhra_document_search(term=term)
        n_items = len(search_results.items)
        if n_items > max_docs_src:
            self.logger.warning(f'from #items={n_items} only max_docs_src={max_docs_src} will be parsed')
        items = search_results.items[:max_docs_src]

        # Parse the documents
        data = await self._parse_items(items=items)
        n_data = len(data)
        if n_data > max_docs_src:
            self.logger.warning(f'the #items={n_items} were chunked into #documents={n_data} from where only max_docs_src={max_docs_src} will be added to the queue')
        data = data[:max_docs_src]

        # Add documents to the queue
        for i, doc in enumerate(data):
            id_ = f'{self._source}_{i}'
            item = QueueItem(id_=id_, doc=doc)
            await queue_out.put(item)

        self.logger.info(f'retrieved #docs={len(data)} in source={self._source} for term={term}')
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
class FDAScraper(Scraper):
|
| 675 |
+
"""Class for scraping data from the Food and Drug Administration.
|
| 676 |
+
|
| 677 |
+
The scraper uses the the same API as the web interface fo the FDA to search for relevant documents. By default the search applies the following filter:
|
| 678 |
+
- sorting by highest relevance
|
| 679 |
+
- filter for results from the Center of Drug Evaluation and Research
|
| 680 |
+
- filter for English language
|
| 681 |
+
- filter for Drugs
|
| 682 |
+
"""
|
| 683 |
+
|
| 684 |
+
_source = 'fda'
|
| 685 |
+
_source_url = 'https://www.fda.gov'
|
| 686 |
+
_source_favicon_url = 'https://www.fda.gov/favicon.ico'
|
| 687 |
+
|
| 688 |
+
_search_template = (
|
| 689 |
+
'https://www.fda.gov/search?s={term}'
|
| 690 |
+
'&items_per_page=10'
|
| 691 |
+
'&sort_bef_combine=rel_DESC' # Sort by relevance
|
| 692 |
+
'&f%5B0%5D=center%3A815' # Filter for the Center for Drug Evaluation and Research
|
| 693 |
+
'&f%5B1%5D=language%3A1404' # Filter for English language
|
| 694 |
+
'&f%5B2%5D=prod%3A2312' # Filter for the Drugs section
|
| 695 |
+
)
|
| 696 |
+
_language = 'en'
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
def _extract_search_results_count(self, soup: BeautifulSoup) -> int | None:
    """Extract the number of search results from the search info section."""
    parent = soup.find('div', class_='lcds-search-filters__info')
    if parent is not None:
        div = parent.find('div', class_='view-header')
        # BUGFIX: guard against a missing view-header (the original dereferenced
        # div.text unconditionally and raised AttributeError).
        if div is not None:
            # BUGFIX: the original pattern 'entr[y|ies]' is a character class matching a
            # single character from {y,|,i,e,s}, not the intended alternation entry/entries.
            match = re.search(r'of (\d+) entr(?:y|ies)', div.text)
            if match:
                return int(match.group(1))
    self.logger.warning('no search info found')
    return None
|
| 709 |
+
|
| 710 |
+
def _extract_number_of_pages(self, soup: BeautifulSoup) -> int | None:
    """Extract the number of pages from the search results (pager is 0-indexed, hence +1)."""
    pager = soup.find('nav', class_=['pager-nav', 'text-center'])
    if pager is not None:
        last_item = pager.find('li', class_=['pager__item', 'pager__item--last'])
        anchor = last_item.find('a')
        if anchor and anchor.has_attr('href'):
            page_match = re.search(r'&page=(\d+)', anchor['href'])
            if page_match:
                return int(page_match.group(1)) + 1
    self.logger.warning('no pager found')
    return None
|
| 723 |
+
|
| 724 |
+
@staticmethod
def _extract_search_item_divs(soup: BeautifulSoup) -> List[Tag]:
    """Extract the divs containing the search results."""
    container = soup.find('div', class_='view-content')
    # Only the immediate children are result entries
    return container.find_all('div', recursive=False)
|
| 729 |
+
|
| 730 |
+
async def _fda_document_search(self, term: str, max_docs_src: int) -> SearchResults:
    """Search the FDA database for documents with a given term.

    Args:
    - term: the search term
    - max_docs_src: stop paging once this many result items have been collected
    """
    # Get search results
    base_url = self._search_template.format(term=term)
    self.logger.debug(f'search fda database with url={base_url}')
    content = await self._aiohttp_get_html(url=base_url)
    soup = BeautifulSoup(content, 'html.parser')

    # Get the number of search results and the number of pages
    count = self._extract_search_results_count(soup=soup)
    n_pages = self._extract_number_of_pages(soup=soup)

    # Extract the divs containing the search results
    items = []
    if count is not None and count > 0:
        # Extract items from page=0
        items_from_page = self._extract_search_item_divs(soup=soup)
        items.extend(items_from_page)

        # Extract items from page=1, 2, ...
        if n_pages is not None and n_pages > 1:
            for i in range(1, n_pages):
                # BUGFIX: build each page URL from the unmodified base URL. The original
                # appended '&page={i}' to the previously modified URL, so page parameters
                # accumulated (…&page=1&page=2&page=3…).
                page_url = f'{base_url}&page={i}'
                content = await self._aiohttp_get_html(url=page_url)
                soup = BeautifulSoup(content, 'html.parser')

                items_from_page = self._extract_search_item_divs(soup=soup)
                items.extend(items_from_page)

                if len(items) >= max_docs_src:
                    self.logger.debug(f'found #items={len(items)} in #pages={i+1}')
                    break

    # Check for extraction mismatch (expected when paging stopped early at max_docs_src)
    if len(items) != count:
        self.logger.warning(f'mismatch #items={len(items)} and the total count={count}')

    self.logger.debug(f'extracted #items={len(items)} in #pages={n_pages}')
    return SearchResults(count=count, n_pages=n_pages, items=items)
|
| 770 |
+
|
| 771 |
+
@staticmethod
def _extract_title(main: Tag) -> str | None:
    """Extract the title of the document."""
    header = main.find('header', class_=['row', 'content-header'])
    if header is None:
        return None
    heading = header.find('h1', class_=['content-title', 'text-center'])
    return heading.text if heading is not None else None
|
| 780 |
+
|
| 781 |
+
@staticmethod
def _extract_text(main: Tag) -> str | None:
    """Extract the text from the document's main content column."""
    body = main.find('div', attrs={'class': 'col-md-8 col-md-push-2', 'role': 'main'})
    if body is None:
        return None
    return body.get_text()
|
| 786 |
+
|
| 787 |
+
@staticmethod
def _extract_date(main: Tag) -> datetime | None:
    """Extract the publication date of the document.

    Prefers the <time> element inside the description list's 'cell-2_2' entry; falls
    back to the first <time> element in <main>.
    """
    dl = main.find('dl', class_='lcds-description-list--grid')
    if dl is not None:
        dd = dl.find('dd', class_='cell-2_2')
        # BUGFIX: guard against a description list without the expected cell — the
        # original called dd.find('time') and raised AttributeError when dd was None.
        time_tag = dd.find('time') if dd is not None else main.find('time')
    else:
        time_tag = main.find('time')
    if time_tag and time_tag.has_attr('datetime'):
        return datetime.fromisoformat(time_tag['datetime'])
    return None
|
| 799 |
+
|
| 800 |
+
async def _parse_item_page(self, url: str) -> List[Document]:
|
| 801 |
+
content = await self._aiohttp_get_html(url=url)
|
| 802 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 803 |
+
main = soup.find('main')
|
| 804 |
+
|
| 805 |
+
# Extract the relevant information from the document
|
| 806 |
+
title = self._extract_title(main=main)
|
| 807 |
+
text = self._extract_text(main=main)
|
| 808 |
+
publication_date = self._extract_date(main=main)
|
| 809 |
+
|
| 810 |
+
# Split long texts into chunks
|
| 811 |
+
texts = self.split_text_into_chunks(text=text)
|
| 812 |
+
|
| 813 |
+
# Create the Document object(s)
|
| 814 |
+
data = []
|
| 815 |
+
for text in texts:
|
| 816 |
+
document = Document(
|
| 817 |
+
id_=f'{self._source_url} {title} {text} {publication_date}',
|
| 818 |
+
text=text,
|
| 819 |
+
source=self._source,
|
| 820 |
+
title=title,
|
| 821 |
+
url=url,
|
| 822 |
+
source_url=self._source_url,
|
| 823 |
+
source_favicon_url=self._source_favicon_url,
|
| 824 |
+
language=self._language,
|
| 825 |
+
publication_date=publication_date,
|
| 826 |
+
)
|
| 827 |
+
data.append(document)
|
| 828 |
+
return data
|
| 829 |
+
|
| 830 |
+
async def _parse_items(self, items: List[Tag]) -> List[Document]:
|
| 831 |
+
"""From a list of divs containing the search results, extract the relevant information and parse it into a list
|
| 832 |
+
of :class:`Document` objects.
|
| 833 |
+
"""
|
| 834 |
+
data = []
|
| 835 |
+
for i, tag in enumerate(items):
|
| 836 |
+
link = tag.find('a', href=True)
|
| 837 |
+
if link is None:
|
| 838 |
+
self.logger.warning(f'no link found for item {i}')
|
| 839 |
+
continue
|
| 840 |
+
url = self._source_url + link['href']
|
| 841 |
+
|
| 842 |
+
# Create the Document object
|
| 843 |
+
self.logger.debug(f'parsing item with url={url}')
|
| 844 |
+
page_data = await self._parse_item_page(url=url)
|
| 845 |
+
data.extend(page_data)
|
| 846 |
+
self.logger.debug(f'created #docs={len(data)}')
|
| 847 |
+
return data
|
| 848 |
+
|
| 849 |
+
async def apply(self, args_: Namespace, queue_out: asyncio.Queue) -> None:
|
| 850 |
+
term = args_.term
|
| 851 |
+
max_docs_src = args_.max_docs_src
|
| 852 |
+
self.logger.debug(f'starting scraping the source={self._source} with term={term}')
|
| 853 |
+
|
| 854 |
+
# Search for relevant documents with a given term
|
| 855 |
+
search_results = await self._fda_document_search(term=term, max_docs_src=max_docs_src)
|
| 856 |
+
n_items = len(search_results.items)
|
| 857 |
+
if n_items > max_docs_src:
|
| 858 |
+
self.logger.warning(f'from #items={n_items} only max_docs_src={max_docs_src} will be parsed')
|
| 859 |
+
items = search_results.items[:max_docs_src]
|
| 860 |
+
|
| 861 |
+
# Parse the documents
|
| 862 |
+
data = await self._parse_items(items=items)
|
| 863 |
+
n_data = len(data)
|
| 864 |
+
if n_data > max_docs_src:
|
| 865 |
+
self.logger.warning(f'the #items={n_items} were chunked into #documents={n_data} from where only max_docs_src={max_docs_src} will be added to the queue')
|
| 866 |
+
data = data[:max_docs_src]
|
| 867 |
+
|
| 868 |
+
# Add documents to the queue
|
| 869 |
+
for i, doc in enumerate(data):
|
| 870 |
+
id_ = f'{self._source}_{i}'
|
| 871 |
+
item = QueueItem(id_=id_, doc=doc)
|
| 872 |
+
await queue_out.put(item)
|
| 873 |
+
|
| 874 |
+
self.logger.info(f'retrieved #docs={len(data)} in source={self._source} for term={term}')
|
| 875 |
+
|
| 876 |
+
|
| 877 |
+
# Registry of available scraper classes; must stay aligned (same order and
# same length) with the SCRAPING_SOURCES setting.
_SCRAPERS = [PubmedScraper, EMAScraper, MHRAScraper, FDAScraper]

# Validate BEFORE building the mapping: zip() silently truncates unequal
# sequences, so checking afterwards would let an incomplete source->scraper
# table be constructed first.
if len(_SCRAPERS) != len(SCRAPING_SOURCES):
    raise ValueError("number of scrapers and sources do not match")

_SOURCE_TO_SCRAPER = dict(zip(SCRAPING_SOURCES, _SCRAPERS))
|
| 881 |
+
|
| 882 |
+
async def _scraping(args_: Namespace, queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> None:
    """Pop a source (str) from the input queue, perform the scraping task with the given term, and put the results in
    the output queue until the input queue is empty.

    Args:
        - args_: the arguments for the spock pipeline
        - queue_in: the input queue containing the sources to scrape
        - queue_out: the output queue for the scraped data
    """

    while True:
        # Get source from input queue
        source = await queue_in.get()

        # A ``None`` sentinel signals that no more sources will arrive
        if source is None:
            queue_in.task_done()
            break

        # Look up the scraper class registered for this source
        scraper = _SOURCE_TO_SCRAPER.get(source)  # type: type[Scraper]
        if scraper is None:
            # Skip unknown sources but keep the worker alive so the remaining
            # queue entries are still processed (previously this `break`-ed,
            # terminating the worker and stranding valid sources in the queue).
            logger.error(f'unknown source={source}')
            queue_in.task_done()
            continue

        # Best-effort scraping: log failures and move on to the next source
        try:
            await scraper().apply(args_=args_, queue_out=queue_out)
        except Exception as e:
            logger.error(f'error during scraping for source={source} and term={args_.term}: {e}')
        finally:
            queue_in.task_done()
|
| 915 |
+
|
| 916 |
+
|
| 917 |
+
def create_tasks(args_: Namespace, queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> List[asyncio.Task]:
    """Create the asyncio scraping tasks."""
    n_tasks = args_.n_scp_tasks
    logger.info(f'setting up {n_tasks} scraping task(s) for source(s)={args_.source}')
    tasks = []
    for _ in range(n_tasks):
        worker = _scraping(args_=args_, queue_in=queue_in, queue_out=queue_out)
        tasks.append(asyncio.create_task(worker))
    return tasks
|