update worker.py
worker.py (CHANGED)

The substantive change is in connect_to_rabbitmq: the consumed queue is now named "gpu_server" and is declared with durable=True (the previous queue name is truncated in this view). The rest of the file is unchanged; the full diff, with the two changed lines marked, is:
```diff
@@ -1,126 +1,126 @@
 #!/usr/bin/env python3
 import os
 import json
 import time
 import threading
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import pika
 from typing import Tuple, Dict, Any
 
 from mineru_single import Processor
 
 class MessageProcessor:
     def __init__(self):
         self.processor = Processor()
 
     def process_message(self, body_bytes: bytes) -> Tuple[str, Dict[str, Any]]:
         """Process incoming message and return processed results"""
         body_str = body_bytes.decode("utf-8")
         data = json.loads(body_str)
 
         headers = data.get("headers", {})
         request_type = headers.get("request_type", "")
         request_id = headers.get("request_id", "")
         body = data.get("body", {})
 
         if request_type != "process_files":
             return "No processing done", data
 
         input_files = body.get("input_files", [])
         topics = body.get("topics", [])
 
         urls, file_key_map = self._extract_urls_and_keys(input_files)
         batch_results = self.processor.process_batch(urls)
         md_context = self._create_markdown_context(batch_results, file_key_map)
 
         final_json = self._create_response_json(request_id, input_files, topics, md_context)
         return json.dumps(final_json, ensure_ascii=False), final_json
 
     def _extract_urls_and_keys(self, input_files: list) -> Tuple[list, dict]:
         """Extract URLs and create file key mapping"""
         urls = []
         file_key_map = {}
         for f in input_files:
             key = f.get("key", "")
             url = f.get("url", "")
             urls.append(url)
             file_key_map[url] = key
         return urls, file_key_map
 
     def _create_markdown_context(self, batch_results: dict, file_key_map: dict) -> list:
         """Create markdown context from batch results"""
         md_context = []
         for url, md_content in batch_results.items():
             key = file_key_map.get(url, "")
             md_context.append({"key": key, "body": md_content})
         return md_context
 
     def _create_response_json(self, request_id: str, input_files: list,
                               topics: list, md_context: list) -> dict:
         """Create the final response JSON"""
         return {
             "headers": {
                 "request_type": "question_extraction_update_from_gpu_server",
                 "request_id": request_id
             },
             "body": {
                 "input_files": input_files,
                 "topics": topics,
                 "md_context": md_context
             }
         }
 
 class RabbitMQWorker:
     def __init__(self, num_workers: int = 1):
         self.num_workers = num_workers
         self.message_processor = MessageProcessor()
 
     def callback(self, ch, method, properties, body):
         """Handle incoming RabbitMQ messages"""
         thread_id = threading.current_thread().name
         headers = properties.headers or {}
 
         print(f"[Worker {thread_id}] Received message: {body}, headers: {headers}")
 
         if headers.get("process") == "topic_extraction":
             raw_text_outputs, parsed_json_outputs = self.message_processor.process_message(body)
             print(f"[Worker {thread_id}] Pipeline result:\n{raw_text_outputs}")
         else:
             print(f"[Worker {thread_id}] Unknown process, sleeping 10s.")
             time.sleep(10)
         print("[Worker] Done")
 
     def worker(self, channel):
         """Worker process to consume messages"""
         try:
             channel.start_consuming()
         except Exception as e:
             print(f"[Worker] Error: {e}")
 
     def connect_to_rabbitmq(self):
         """Establish connection to RabbitMQ"""
         rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
         connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
         channel = connection.channel()
 
-        channel.queue_declare(queue="
+        channel.queue_declare(queue="gpu_server", durable=True)
         channel.basic_qos(prefetch_count=1)
         channel.basic_consume(
-            queue="
+            queue="gpu_server",
             on_message_callback=self.callback,
             auto_ack=True
         )
         return connection, channel
 
     def start(self):
         """Start the worker threads"""
         print(f"Starting {self.num_workers} workers")
         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             for _ in range(self.num_workers):
                 connection, channel = self.connect_to_rabbitmq()
                 executor.submit(self.worker, channel)
 
 def main():
     worker = RabbitMQWorker()
     worker.start()
```
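One thing worth noting: worker.py defines main() but never invokes it, so executing the module directly does nothing. Assuming the usual convention (the guard is not in the file as shown in this diff), starting the worker would need something like:

```python
# Assumed entry point -- not present in worker.py as shown in this diff.
if __name__ == "__main__":
    main()
```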
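For context on the envelope this worker consumes: callback only routes messages whose AMQP header "process" equals "topic_extraction", and process_message then requires headers.request_type == "process_files" inside the JSON body, with input_files entries carrying "key" and "url". A minimal publisher sketch against the "gpu_server" queue from this diff; the request ID, key, URL, and topic values are hypothetical placeholders:

```python
# Publisher sketch. The envelope shape mirrors what worker.py reads;
# the concrete values (request_id, key, url, topic) are made up.
import json
import pika

connection = pika.BlockingConnection(
    pika.URLParameters("amqp://guest:guest@localhost:5672/")
)
channel = connection.channel()
channel.queue_declare(queue="gpu_server", durable=True)  # must match the consumer

message = {
    "headers": {
        "request_type": "process_files",  # anything else returns "No processing done"
        "request_id": "req-123",          # echoed back in the response headers
    },
    "body": {
        "input_files": [
            {"key": "doc-1", "url": "https://example.com/doc-1.pdf"},
        ],
        "topics": ["algebra"],
    },
}

channel.basic_publish(
    exchange="",
    routing_key="gpu_server",
    body=json.dumps(message).encode("utf-8"),
    properties=pika.BasicProperties(
        headers={"process": "topic_extraction"},  # routes into process_message()
        delivery_mode=2,  # persistent, to match the durable queue
    ),
)
connection.close()
```

Note that the response JSON process_message builds (request_type "question_extraction_update_from_gpu_server") is only returned to callback, which prints it; nothing in this file publishes it back to the broker.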
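A caveat on delivery guarantees: the consumer keeps auto_ack=True, so the broker considers a message delivered the moment it is handed to the worker, and a crash mid-process_batch loses it even though the queue is now durable; auto-acked deliveries also ignore the basic_qos(prefetch_count=1) limit, since prefetch only throttles unacknowledged messages. A sketch of a manual-ack variant (an alternative, not what this commit does) would omit auto_ack=True and acknowledge after processing:

```python
# Manual-ack sketch: requires basic_consume without auto_ack=True.
# Not what worker.py does in this diff.
def callback(self, ch, method, properties, body):
    try:
        raw_text, _ = self.message_processor.process_message(body)
        ch.basic_ack(delivery_tag=method.delivery_tag)
    except Exception as e:
        print(f"[Worker] Failed: {e}")
        # requeue=False avoids hot-looping on a poison message
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
```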