MinerU

Paused

App Files Files Community

princhman commited on Feb 17, 2025

Commit

1e9bbdd

1 Parent(s): afe430b

update

Browse files

Files changed (1) hide show

worker.py +112 -165

worker.py CHANGED Viewed

@@ -6,174 +6,121 @@ import threading
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import pika
 from mineru_single import Processor
-processor = Processor()
-def run_pipeline(body_bytes: bytes):
-    """
-    1) Decode the body bytes to a string.
-    2) Parse the JSON. We expect something like:
-       {
-         "headers": {"request_type": "process_files", "request_id": "..."},
-         "body": {
-             "input_files": [...],
-             "topics": [...]
-         }
-       }
-    3) If request_type == "process_files", call processor.process_batch(...) on the URLs.
-    4) Return raw_text_outputs (str) and parsed_json_outputs (dict).
-    """
-    body_str = body_bytes.decode("utf-8")
-    data = json.loads(body_str)
-    headers = data.get("headers", {})
-    request_type = headers.get("request_type", "")
-    request_id = headers.get("request_id", "")
-    body = data.get("body", {})
-    # If it's not "process_files", we do nothing special
-    if request_type != "process_files":
-        return "No processing done", data
-    # Gather file URLs
-    input_files = body.get("input_files", [])
-    topics = body.get("topics", [])
-    urls = []
-    file_key_map = {}
-    for f in input_files:
-        key = f.get("key", "")
-        url = f.get("url", "")
-        urls.append(url)
-        file_key_map[url] = key
-    batch_results = processor.process_batch(urls)  # {url: markdown_string}
-    md_context = []
-    for url, md_content in batch_results.items():
-        key = file_key_map.get(url, "")
-        md_context.append({"key": key, "body": md_content})
-    out_headers = {
-        "request_type": "question_extraction_update_from_gpu_server",
-        "request_id": request_id
-    }
-    out_body = {
-        "input_files": input_files,
-        "topics": topics,
-        "md_context": md_context
-    }
-    final_json = {
-        "headers": out_headers,
-        "body": out_body
-    }
-    return json.dumps(final_json, ensure_ascii=False), final_json
-def callback(ch, method, properties, body):
-    """
-    This function is invoked for each incoming RabbitMQ message.
-    """
-    thread_id = threading.current_thread().name
-    headers = properties.headers or {}
-    print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
-    # If the header "process" is "topic_extraction", we run our pipeline
-    if headers.get("process") == "topic_extraction":
-        raw_text_outputs, parsed_json_outputs = run_pipeline(body)
-        # Do something with the result, e.g. print or store in DB
-        print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
-    else:
-        # Fallback if "process" is something else
-        print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
-        time.sleep(10)
-        print("[Worker] Done")
-def worker(channel):
-    try:
-        channel.start_consuming()
-    except Exception as e:
-        print(f"[Worker] Error: {e}")
-def connect_to_rabbitmq():
-    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
-    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
-    channel = connection.channel()
-    # Declare the queue
-    channel.queue_declare(queue="ml_server", durable=True)
-    # Limit messages per worker
-    channel.basic_qos(prefetch_count=1)
-    # auto_ack=True for simplicity, else you must ack manually
-    channel.basic_consume(
-        queue="ml_server",
-        on_message_callback=callback,
-        auto_ack=True
-    )
-    return connection, channel
-def main():
-    """
-    Main entry: starts multiple worker threads to consume from the queue.
-    """
-    num_workers = 2 # hard code for now
-    print(f"Starting {num_workers} workers")
-    with ThreadPoolExecutor(max_workers=num_workers) as executor:
-        for _ in range(num_workers):
-            connection, channel = connect_to_rabbitmq()
-            executor.submit(worker, channel)
-if __name__ == "__main__":
-    """
-    If run directly, we also publish a test message, then start the workers.
-    """
-    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
-    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
-    channel = connection.channel()
-    channel.queue_declare(queue="ml_server", durable=True)
-    sample_message = {
-        "headers": {
-            "request_type": "process_files",
-            "request_id": "abc123"
-        },
-        "body": {
-            "input_files": [
-                {
-                    "key": "file1",
-                    "url": "https://example.com/file1.pdf",
-                    "type": "mark_scheme"
-                },
-                {
-                    "key": "file2",
-                    "url": "https://example.com/file2.pdf",
-                    "type": "question"
-                }
-            ],
-            "topics": [
-                {
-                    "title": "Algebra",
-                    "id": 123
-                }
-            ]
         }
-    }
-    channel.basic_publish(
-        exchange="",
-        routing_key="ml_server",
-        body=json.dumps(sample_message),
-        properties=pika.BasicProperties(
-            headers={"process": "topic_extraction"}
         )
-    )
-    connection.close()
-    main()

 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import pika
+from typing import Tuple, Dict, Any
 from mineru_single import Processor
+class MessageProcessor:
+    def __init__(self):
+        self.processor = Processor()
+    def process_message(self, body_bytes: bytes) -> Tuple[str, Dict[str, Any]]:
+        """Process incoming message and return processed results"""
+        body_str = body_bytes.decode("utf-8")
+        data = json.loads(body_str)
+        headers = data.get("headers", {})
+        request_type = headers.get("request_type", "")
+        request_id = headers.get("request_id", "")
+        body = data.get("body", {})
+        if request_type != "process_files":
+            return "No processing done", data
+        input_files = body.get("input_files", [])
+        topics = body.get("topics", [])
+        urls, file_key_map = self._extract_urls_and_keys(input_files)
+        batch_results = self.processor.process_batch(urls)
+        md_context = self._create_markdown_context(batch_results, file_key_map)
+        final_json = self._create_response_json(request_id, input_files, topics, md_context)
+        return json.dumps(final_json, ensure_ascii=False), final_json
+    def _extract_urls_and_keys(self, input_files: list) -> Tuple[list, dict]:
+        """Extract URLs and create file key mapping"""
+        urls = []
+        file_key_map = {}
+        for f in input_files:
+            key = f.get("key", "")
+            url = f.get("url", "")
+            urls.append(url)
+            file_key_map[url] = key
+        return urls, file_key_map
+    def _create_markdown_context(self, batch_results: dict, file_key_map: dict) -> list:
+        """Create markdown context from batch results"""
+        md_context = []
+        for url, md_content in batch_results.items():
+            key = file_key_map.get(url, "")
+            md_context.append({"key": key, "body": md_content})
+        return md_context
+    def _create_response_json(self, request_id: str, input_files: list,
+                            topics: list, md_context: list) -> dict:
+        """Create the final response JSON"""
+        return {
+            "headers": {
+                "request_type": "question_extraction_update_from_gpu_server",
+                "request_id": request_id
+            },
+            "body": {
+                "input_files": input_files,
+                "topics": topics,
+                "md_context": md_context
+            }
         }
+class RabbitMQWorker:
+    def __init__(self, num_workers: int = 1):
+        self.num_workers = num_workers
+        self.message_processor = MessageProcessor()
+    def callback(self, ch, method, properties, body):
+        """Handle incoming RabbitMQ messages"""
+        thread_id = threading.current_thread().name
+        headers = properties.headers or {}
+        print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
+        if headers.get("process") == "topic_extraction":
+            raw_text_outputs, parsed_json_outputs = self.message_processor.process_message(body)
+            print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
+        else:
+            print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
+            time.sleep(10)
+            print("[Worker] Done")
+    def worker(self, channel):
+        """Worker process to consume messages"""
+        try:
+            channel.start_consuming()
+        except Exception as e:
+            print(f"[Worker] Error: {e}")
+    def connect_to_rabbitmq(self):
+        """Establish connection to RabbitMQ"""
+        rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
+        connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
+        channel = connection.channel()
+        channel.queue_declare(queue="ml_server", durable=True)
+        channel.basic_qos(prefetch_count=1)
+        channel.basic_consume(
+            queue="ml_server",
+            on_message_callback=self.callback,
+            auto_ack=True
         )
+        return connection, channel
+    def start(self):
+        """Start the worker threads"""
+        print(f"Starting {self.num_workers} workers")
+        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+            for _ in range(self.num_workers):
+                connection, channel = self.connect_to_rabbitmq()
+                executor.submit(self.worker, channel)
+def main():
+    worker = RabbitMQWorker()
+    worker.start()