Spaces:
Paused
Paused
| import base64 | |
| import mimetypes | |
| import os | |
| from pathlib import Path | |
| from typing import Any, Dict, List | |
| import gradio as gr | |
| from openai import OpenAI | |
# Model served by default; override via the DEFAULT_MODEL environment variable.
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "qianfan-vl-70b")

# Shared OpenAI-compatible client pointed at Baidu Qianfan's v2 endpoint.
# NOTE(review): api_key falls back to "" when QIANFAN_API_KEY is unset, which
# defers the auth failure to the first API call — confirm this is intended.
_client = OpenAI(
    base_url="https://qianfan.baidubce.com/v2",
    api_key=os.getenv("QIANFAN_API_KEY", ""),
)
def _data_url(path: str) -> str:
    """Encode the file at *path* as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; unknown types fall back
    to ``application/octet-stream``.
    """
    guessed, _ = mimetypes.guess_type(path)
    payload = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return "data:{};base64,{}".format(guessed or "application/octet-stream", payload)
def _image_content(path: str) -> Dict[str, Any]:
    """Wrap a local image file as an OpenAI ``image_url`` content part."""
    url = _data_url(path)
    return {"type": "image_url", "image_url": {"url": url}}
def _text_content(text: str) -> Dict[str, Any]:
    """Wrap plain text as an OpenAI ``text`` content part."""
    part: Dict[str, Any] = {"type": "text"}
    part["text"] = text
    return part
def _message(role: str, content: Any) -> Dict[str, Any]:
    """Build a chat message dict for the given role and content."""
    return dict(role=role, content=content)
def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a gradio MultimodalTextbox payload into one user chat message.

    Image attachments (if any) come first as image parts, followed by the
    stripped prompt text when it is non-empty.
    """
    attachments = message.get("files") or []
    prompt = (message.get("text") or "").strip()
    parts: List[Dict[str, Any]] = []
    for path in attachments:
        parts.append(_image_content(path))
    if prompt:
        parts.append(_text_content(prompt))
    return _message("user", parts)
def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert gradio messages-format history into OpenAI chat messages.

    Consecutive user entries (text strings and file tuples) are merged
    into a single multi-part user message preceding each assistant reply.

    Fixes over the original:
    - pending user content is flushed at the end, so trailing user turns
      with no assistant reply are no longer silently dropped;
    - an empty user content list is never emitted as a message.
    """
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []
    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append({"type": "text", "text": content})
            elif isinstance(content, tuple):
                # gradio represents uploaded files as tuples of paths
                user_content.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            if user_content:
                msgs.append({"role": "user", "content": user_content.copy()})
                user_content.clear()
            msgs.append({"role": "assistant", "content": content})
    if user_content:
        # don't lose user turns that have no assistant reply yet
        msgs.append({"role": "user", "content": user_content.copy()})
    return msgs
def _add_image_tokens(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Prefix each user message's text with one ``<image>`` token per image.

    Non-user messages, and user messages whose content is not a list, pass
    through unchanged.

    Fixes over the original:
    - a user message with images but no text part now gains a text part
      holding the ``<image>`` tokens instead of silently losing them;
    - content items of unrecognized type are preserved, not dropped.
    """
    new_messages: List[Dict[str, Any]] = []
    for message in messages:
        content = message.get("content", [])
        if message.get("role") != "user" or not isinstance(content, list):
            new_messages.append(message)
            continue
        image_count = sum(1 for item in content if item.get("type") == "image_url")
        image_tokens = "<image>" * image_count
        new_content: List[Dict[str, Any]] = []
        tokens_placed = False
        for item in content:
            if item.get("type") == "text":
                new_content.append({"type": "text", "text": image_tokens + item.get("text", "")})
                tokens_placed = True
            else:
                # image_url and any other item types pass through untouched
                new_content.append(item)
        if image_tokens and not tokens_placed:
            # image-only message: tokens still need somewhere to live
            new_content.append({"type": "text", "text": image_tokens})
        new_messages.append({"role": "user", "content": new_content})
    return new_messages
def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
    """Stream a model reply for the latest multimodal chat message.

    Builds the full OpenAI-style message list from *history* plus the new
    *message*, inserts ``<image>`` tokens, and yields the accumulated reply
    text after each streamed chunk. On any API failure a single
    human-readable error string is yielded instead of raising.
    """
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    messages = _add_image_tokens(messages)
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            # near-zero temperature for (mostly) deterministic answers
            temperature=0.000001,
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            # Some providers emit keep-alive chunks with no choices; skip
            # them instead of raising IndexError mid-stream.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        # Surface the failure in the chat UI rather than crashing the app.
        yield f"Failed to get response: {e}"
def build_demo() -> gr.Blocks:
    """Assemble the Qianfan-VL chat demo and return the queued app."""
    # Chat transcript area; <think> tags are allowed for reasoning display.
    history_view = gr.Chatbot(type="messages", allow_tags=["think"])
    # Combined text + multi-image input box.
    input_box = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
        file_count="multiple",
        max_plain_text_length=32768,
    )
    # Model picker shown under the "Options" accordion.
    picker = gr.Dropdown(
        label="Model",
        choices=[
            ("Qianfan-VL-70B", "qianfan-vl-70b"),
            ("Qianfan-VL-8B", "qianfan-vl-8b"),
        ],
        value=DEFAULT_MODEL,
    )
    app = gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=history_view,
        textbox=input_box,
        title="Qianfan-VL: Domain-Enhanced Vision-Language Models",
        description="""**Qianfan-VL** is a series of general-purpose multimodal large language models enhanced for enterprise-level multimodal applications. The models offer deep optimization for high-frequency scenarios in industrial deployment while maintaining strong general capabilities.
🔗 **Links**: [GitHub](https://github.com/baidubce/Qianfan-VL) | [HuggingFace](https://huggingface.co/baidu) | [ModelScope](https://modelscope.cn/organization/baidu-qianfan) | [Documentation](https://github.com/baidubce/qianfan-models-cookbook)""",
        additional_inputs=[picker],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    )
    return app.queue(default_concurrency_limit=8)
def main():
    """Build the demo UI and serve it locally."""
    demo = build_demo()
    demo.launch()


if __name__ == "__main__":
    main()