"""MCP server exposing Hugging Face Inference API tasks as tools.

Each tool wraps a ``huggingface_hub.InferenceClient`` call and returns a
plain string (stringified result, base64 payload, or ``"Error: ..."`` on
failure) so the MCP transport never has to serialize rich objects.
"""

import base64
import io
import os
from typing import Any, Dict, List, Optional  # noqa: F401 (List kept for compat)

from huggingface_hub import InferenceClient
from mcp.server.fastmcp import FastMCP

# Initialize the MCP server.
mcp = FastMCP("Hugging Face tools")

# Token is optional: anonymous requests work for many public models but are
# rate-limited, and gated/private models will fail without authentication.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("Warning: HF_TOKEN environment variable not set. Some authenticated requests may fail.")

client = InferenceClient(token=HF_TOKEN)


@mcp.tool()
def list_available_tasks() -> str:
    """Lists all the AI tasks supported by this server."""
    tasks = [
        "Audio-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Image",
        "Image-Text-to-Video", "Visual Question Answering",
        "Document Question Answering", "Video-Text-to-Text",
        "Visual Document Retrieval", "Depth Estimation",
        "Image Classification", "Object Detection", "Image Segmentation",
        "Text-to-Image", "Image-to-Text", "Image-to-Image", "Image-to-Video",
        "Unconditional Image Generation", "Video Classification",
        "Text-to-Video", "Zero-Shot Image Classification", "Mask Generation",
        "Zero-Shot Object Detection", "Text-to-3D", "Image-to-3D",
        "Image Feature Extraction", "Keypoint Detection", "Video-to-Video",
        "Text Classification", "Token Classification",
        "Table Question Answering", "Question Answering",
        "Zero-Shot Classification", "Translation", "Summarization",
        "Feature Extraction", "Text Generation", "Fill-Mask",
        "Sentence Similarity", "Text Ranking", "Text-to-Speech",
        "Text-to-Audio", "Automatic Speech Recognition", "Audio-to-Audio",
        "Audio Classification", "Voice Activity Detection",
        "Tabular Classification", "Tabular Regression",
        "Time Series Forecasting", "Reinforcement Learning", "Robotics",
        "Graph Machine Learning",
    ]
    return f"Supported Tasks: {', '.join(tasks)}"


@mcp.tool()
def visual_question_answering(image: str, question: str, model: Optional[str] = None) -> str:
    """Answer questions about an image.

    Args:
        image: URL or Base64 string of the image.
        question: The question to answer.
        model: Optional model ID (e.g., 'dandelin/vilt-b32-finetuned-vqa').
    """
    try:
        # InferenceClient accepts a URL, path, or raw bytes for the image
        # argument, so the string is forwarded as-is.
        result = client.visual_question_answering(image, question, model=model)
        return str(result)
    except Exception as e:
        # Tools deliberately return error strings instead of raising, so the
        # MCP client always receives a usable response.
        return f"Error: {e}"


@mcp.tool()
def text_to_image(prompt: str, model: Optional[str] = None) -> str:
    """Generate an image from text.

    Args:
        prompt: Text description of the desired image.
        model: Optional model ID.

    Returns:
        Base64 encoded image string (PNG when the client returns a PIL
        image; raw payload passthrough when it returns bytes).
    """
    try:
        img = client.text_to_image(prompt, model=model)
        # The client normally returns a PIL Image, but raw endpoints may
        # hand back encoded bytes directly; handle both without importing
        # PIL here (duck-typed .save()).
        if isinstance(img, (bytes, bytearray)):
            raw = bytes(img)
        else:
            buffer = io.BytesIO()
            img.save(buffer, format="PNG")
            raw = buffer.getvalue()
        return base64.b64encode(raw).decode("utf-8")
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def image_classification(image: str, model: Optional[str] = None) -> str:
    """Classify an image.

    Args:
        image: URL or Base64 string.
        model: Optional model ID.
    """
    try:
        result = client.image_classification(image, model=model)
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def object_detection(image: str, model: Optional[str] = None) -> str:
    """Detect objects in an image.

    Args:
        image: URL or Base64 string.
        model: Optional model ID.
    """
    try:
        result = client.object_detection(image, model=model)
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def image_to_text(image: str, model: Optional[str] = None) -> str:
    """Generate a caption or text description for an image.

    Args:
        image: URL or Base64 string.
        model: Optional model ID.
    """
    try:
        result = client.image_to_text(image, model=model)
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def text_generation(prompt: str, model: Optional[str] = None, max_new_tokens: int = 500) -> str:
    """Generate text based on a prompt.

    Args:
        prompt: Input text.
        model: Model ID.
        max_new_tokens: Maximum tokens to generate.
    """
    try:
        return client.text_generation(prompt, model=model, max_new_tokens=max_new_tokens)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def summarization(text: str, model: Optional[str] = None) -> str:
    """Summarize a text.

    Args:
        text: Text to summarize.
        model: Optional model ID.
    """
    try:
        result = client.summarization(text, model=model)
        # Some backends return a list like [{'summary_text': ...}]; unwrap
        # it when present, otherwise stringify whatever came back.
        if isinstance(result, list) and len(result) > 0:
            return result[0].get('summary_text', str(result))
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def translation(text: str, model: Optional[str] = None) -> str:
    """Translate text. Model usually determines source/target languages.

    Args:
        text: Text to translate.
        model: Optional model ID (the model choice fixes the language pair).
    """
    try:
        result = client.translation(text, model=model)
        # Unwrap the common [{'translation_text': ...}] response shape.
        if isinstance(result, list) and len(result) > 0:
            return result[0].get('translation_text', str(result))
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def text_classification(text: str, model: Optional[str] = None) -> str:
    """Classify text (e.g. sentiment analysis).

    Args:
        text: Text to classify.
        model: Optional model ID.
    """
    try:
        result = client.text_classification(text, model=model)
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def automatic_speech_recognition(audio: str, model: Optional[str] = None) -> str:
    """Transcribe audio.

    Args:
        audio: URL or Base64 string of the audio file.
        model: Optional model ID.
    """
    try:
        # URLs are forwarded as-is; anything else is assumed to be a
        # base64-encoded payload and decoded to raw bytes first.
        if audio.startswith(("http://", "https://")):
            result = client.automatic_speech_recognition(audio, model=model)
        else:
            audio_data = base64.b64decode(audio)
            result = client.automatic_speech_recognition(audio_data, model=model)
        if isinstance(result, dict):
            return result.get('text', str(result))
        return str(result)
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def text_to_speech(text: str, model: Optional[str] = None) -> str:
    """Generate audio from text.

    Args:
        text: Text to synthesize.
        model: Optional model ID.

    Returns:
        Base64 encoded audio.
    """
    try:
        audio_bytes = client.text_to_speech(text, model=model)
        return base64.b64encode(audio_bytes).decode('utf-8')
    except Exception as e:
        return f"Error: {e}"


@mcp.tool()
def generic_hf_inference(task: str, inputs: Dict[str, Any], model: Optional[str] = None) -> str:
    """Run any Hugging Face inference task that doesn't have a specific tool.

    Args:
        task: The task name (e.g., 'text-generation', 'translation').
        inputs: Dictionary of inputs required for the task.
        model: Model ID to use.
    """
    try:
        # Raw fallback; correct payload shape depends entirely on the task.
        # NOTE(review): InferenceClient.post is deprecated/removed in newer
        # huggingface_hub releases — confirm the pinned version supports it.
        result = client.post(json=inputs, model=model, task=task)
        return str(result)
    except Exception as e:
        return f"Error: {e}"