File size: 8,644 Bytes
f8ba0eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import os
import uuid
import time
import psutil
import uvicorn
import torch
import cv2
import shutil
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from video_processor import extract_frames, FrameSamplingMethod
import argparse

# Command-line configuration.
# This parsing happens at import time, so sys.argv may belong to another
# launcher (for example the `uvicorn module:app ...` CLI or a test runner).
# parse_known_args() keeps the module importable in those cases by ignoring
# arguments it does not recognise instead of exiting the process.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_path",
    type=str,
    default="Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
    help="Model identifier or local path passed to from_pretrained().",
)
args, _unknown_args = parser.parse_known_args()
# Optional GPU metrics support via pynvml.
# The import and the NVML initialisation are guarded separately: an
# `except (ImportError, pynvml.NVMLError)` clause would itself fail with a
# NameError when the import is what failed, because `pynvml` is unbound then.
try:
    import pynvml
except ImportError:
    pynvml = None
    GPU_METRICS_AVAILABLE = False
else:
    try:
        pynvml.nvmlInit()
        GPU_METRICS_AVAILABLE = True
    except pynvml.NVMLError:
        GPU_METRICS_AVAILABLE = False

import logging


# --- Log and temporary-file directory configuration ---
# Uploaded videos and extracted frames are staged under TEMP_VIDEO_DIR;
# logs go to a per-model directory named after the last path component
# of --model_path.
TEMP_VIDEO_DIR = "temp_videos"
LOG_DIR = f"logs/{args.model_path.rsplit('/', 1)[-1]}"
os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# One log file per process start, named by the start timestamp.
log_filename = f"{LOG_DIR}/{time.strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    filename=log_filename,
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# --- FastAPI application initialisation ---
app = FastAPI(title="Qwen2.5-VL-AWQ Video Inference Service")

# --- Load the model and the processor ---
# Loading happens once, at import time, so every request reuses the same
# module-level `model` and `processor` objects. The elapsed wall-clock time
# is written to the log.
logging.info(f"Loading model: {args.model_path}")
model_load_start = time.time()

# Weights are requested as float16 with device_map="auto".
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    args.model_path, 
    torch_dtype='float16', 
    device_map="auto"
)
# The processor is loaded from the same path as the model.
processor = AutoProcessor.from_pretrained(args.model_path)

logging.info(f"Model loaded in {time.time() - model_load_start:.2f} seconds")

_UPLOAD_CHUNK_SIZE = 1024 * 1024  # bytes read from the upload per iteration


async def _save_upload_to_disk(upload, destination_path):
    """Copy the uploaded file to ``destination_path`` in fixed-size chunks."""
    with open(destination_path, "wb") as buffer:
        while True:
            chunk = await upload.read(_UPLOAD_CHUNK_SIZE)
            if not chunk:
                break
            buffer.write(chunk)


def _write_frames_to_disk(frames, frame_dir):
    """Write each frame as a JPEG in ``frame_dir`` and return ``file://`` URIs."""
    frame_uris = []
    for index, frame in enumerate(frames):
        frame_path = os.path.join(frame_dir, f"frame_{index:04d}.jpg")
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(frame_path, frame):
            raise RuntimeError(f"Failed to write frame {index} to {frame_path}")
        frame_uris.append(f"file://{os.path.abspath(frame_path)}")
    return frame_uris


def _gpu_metrics_summary():
    """Return a one-line description of GPU 0 utilisation and memory for the log."""
    if not GPU_METRICS_AVAILABLE:
        return "Not available (pynvml not installed or NVIDIA driver issue)"
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return f"GPU Utilization: {utilization.gpu}%, Memory Used: {memory_info.used / (1024**2):.2f}/{memory_info.total / (1024**2):.2f} MB"
    except pynvml.NVMLError as e:
        return f"Could not retrieve GPU metrics: {e}"


@app.post("/video-inference/")
async def video_inference(
    prompt: str = Form(...),
    video_file: UploadFile = File(...),
    sampling_method: FrameSamplingMethod = Form(FrameSamplingMethod.CONTENT_AWARE),
    sampling_rate: int = Form(5),
):
    """
    Receive a video and a text prompt, run inference, and return the result.

    Args:
        prompt: Text prompt sent to the model together with the video frames.
        video_file: Uploaded video; its content type must start with "video/".
        sampling_method: Frame sampling strategy forwarded to extract_frames().
        sampling_rate: Sampling rate/threshold forwarded to extract_frames().

    Returns:
        JSONResponse whose body is {"response": <generated text>}.

    Raises:
        HTTPException: 400 when the upload is not a video or no frames could
            be extracted; 500 for any other failure during processing.
    """
    request_start_time = time.time()
    request_id = str(uuid.uuid4())
    logging.info(f"[{request_id}] Received new video inference request. Prompt: '{prompt}', Video: '{video_file.filename}'")

    # content_type may be missing on the multipart part; treat that as "not a video"
    # instead of failing with an AttributeError.
    if not (video_file.content_type or "").startswith("video/"):
        logging.error(f"[{request_id}] Uploaded file '{video_file.filename}' is not a video. Content-Type: {video_file.content_type}")
        raise HTTPException(status_code=400, detail="Uploaded file is not a video.")

    # Only the extension of the client-supplied name is reused; the stored file
    # itself is named after the request id. filename may be None.
    file_extension = os.path.splitext(video_file.filename or "")[1]
    temp_video_path = os.path.join(TEMP_VIDEO_DIR, f"{request_id}{file_extension}")
    temp_frame_dir = os.path.join(TEMP_VIDEO_DIR, request_id)
    os.makedirs(temp_frame_dir, exist_ok=True)

    try:
        # 1. Save the video and extract frames.
        await _save_upload_to_disk(video_file, temp_video_path)

        logging.info(f"[{request_id}] Video saved to temporary file: {temp_video_path}")
        logging.info(f"[{request_id}] Extracting frames using method: {sampling_method.value}, rate/threshold: {sampling_rate}")

        frames = extract_frames(temp_video_path, sampling_method, sampling_rate)
        if not frames:
            logging.error(f"[{request_id}] Could not extract any frames from the video: {temp_video_path}")
            raise HTTPException(status_code=400, detail="Could not extract any frames from the video.")

        logging.info(f"[{request_id}] Extracted {len(frames)} frames successfully. Saving to temporary files...")

        # Save the frames to temporary files and collect their paths.
        frame_paths = _write_frames_to_disk(frames, temp_frame_dir)

        logging.info(f"[{request_id}] {len(frame_paths)} frames saved to {temp_frame_dir}")

        # 2. Build the video input message.
        message_content = [
            {
                "type": "video",
                "video": frame_paths,
                "resized_height": 280,
                "resized_width": 420,
            },
            {"type": "text", "text": prompt},
        ]
        messages = [{"role": "user", "content": message_content}]

        # 3. Prepare the inference inputs.
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        logging.info(f"[{request_id}] Video processing finished.")
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)
        prompt_token_count = len(inputs.input_ids[0])
        logging.info(f"[{request_id}] Input tokens: {prompt_token_count}")

        # 4. Run inference.
        logging.info(f"[{request_id}] Sending request to model '{args.model_path}'...")
        # With interval=None, cpu_percent() measures since the previous call, so
        # these two calls reset the baseline right before generation starts.
        psutil.cpu_percent(interval=None)
        psutil.cpu_percent(interval=None, percpu=True)
        inference_start_time = time.time()

        # NOTE(review): generate() is a synchronous call inside an async handler,
        # so the event loop is blocked for the whole generation.
        generated_ids = model.generate(**inputs, max_new_tokens=512)

        inference_end_time = time.time()
        cpu_usage = psutil.cpu_percent(interval=None)
        cpu_core_utilization = psutil.cpu_percent(interval=None, percpu=True)

        # Drop the prompt tokens so only newly generated tokens are decoded.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        logging.info(f"[{request_id}] Received response from model successfully.")

        # --- Metric computation ---
        total_request_processing_time = time.time() - request_start_time
        model_inference_latency = inference_end_time - inference_start_time
        num_generated_tokens = len(generated_ids_trimmed[0])
        tokens_per_second = num_generated_tokens / model_inference_latency if model_inference_latency > 0 else 0

        cpu_freq_info = psutil.cpu_freq()
        cpu_freq = cpu_freq_info.current if cpu_freq_info else 'N/A'

        gpu_metrics_log = _gpu_metrics_summary()

        # --- Log formatting ---
        log_message = f"""
[{request_id}] --- Performance & System Metrics ---
  [Request Info]
    - Prompt: "{prompt}"
    - Model: {args.model_path}
    - Sampling Method: {sampling_method.value}, Rate: {sampling_rate}, Frames: {len(frames)}
  [Latency & Throughput]
    - Tokens/Second: {tokens_per_second:.2f}
    - Latency (Model Inference): {model_inference_latency:.4f} s
    - Batch Processing Latency (Total Request Time): {total_request_processing_time:.4f} s
    - Throughput (for this request): {1/total_request_processing_time if total_request_processing_time > 0 else float('inf'):.2f} req/s
  [Token Usage]
    - Prompt Tokens: {prompt_token_count}
    - Response Tokens: {num_generated_tokens}
  [System Usage at Completion]
    - CPU Usage: {cpu_usage}%
    - CPU Core Utilization: {cpu_core_utilization}%
    - CPU Frequency: {cpu_freq} MHz
    - GPU: {gpu_metrics_log}
  [Response]
    - {output_text}
----------------------------------------------------"""
        logging.info(log_message)

        return JSONResponse(content={"response": output_text})

    except HTTPException:
        # Deliberate client-facing errors (such as the 400 for "no frames") must
        # keep their status code instead of being rewrapped as a 500 below.
        raise
    except Exception as e:
        logging.error(f"[{request_id}] An error occurred during processing: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An error occurred during processing: {str(e)}") from e
    finally:
        # Always remove the staged video and frames, whether or not inference succeeded.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
            logging.info(f"[{request_id}] Cleaned up temporary file: {temp_video_path}")
        if os.path.exists(temp_frame_dir):
            shutil.rmtree(temp_frame_dir)
            logging.info(f"[{request_id}] Cleaned up temporary frame directory: {temp_frame_dir}")


if __name__ == "__main__":
    # Script entry point: serve the API on all network interfaces, port 8010.
    uvicorn.run(
        app,
        port=8010,
        host="0.0.0.0",
    )