Spaces:

tomo2chin2
/

HTML2PDF_API

Paused

App Files Files Community

tomo2chin2 committed on Jul 5, 2025

Commit

f107614

verified ·

1 Parent(s): 371916d

Add scale=0.9 to PDF generation for better A4 page fitting

Browse files

Files changed (1) hide show

app.py +130 -0

app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import os
 import uuid
 import tempfile
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 from fastapi import FastAPI, HTTPException, File, UploadFile
 from fastapi.responses import JSONResponse, RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -17,6 +20,10 @@ class HTMLRequest(BaseModel):
     html_content: str
 class PDFResponse(BaseModel):
     pdf_url: str
     filename: str
@@ -24,6 +31,91 @@ class PDFResponse(BaseModel):
     repository_url: str
 async def html_to_pdf_api(html_content: str) -> tuple[str, str]:
     """
     HTMLコンテンツをPDFに変換してHugging Faceデータセットリポジトリにアップロード
@@ -128,6 +220,7 @@ async def root():
         "storage": f"Hugging Face Dataset Repository: {hf_repo_id}",
         "endpoints": {
             "convert": "/convert - HTMLをPDFに変換してHFリポジトリに保存",
             "files": "/files - HFリポジトリ内のPDFファイル一覧",
             "docs": "/docs - API仕様書",
             "health": "/health - ヘルスチェック"
@@ -192,6 +285,43 @@ async def health_check():
     return {"status": "healthy", "timestamp": datetime.now().isoformat()}
 @app.get("/files")
 async def list_files():
     """

 import os
 import uuid
 import tempfile
+import json
+import re
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
+import httpx
 from fastapi import FastAPI, HTTPException, File, UploadFile
 from fastapi.responses import JSONResponse, RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
     html_content: str
+class YAMLRequest(BaseModel):  # Request body accepted by the POST /convert-yaml endpoint
+    yaml_content: str  # Raw YAML text; sent to the Gemini API as the final user message
 class PDFResponse(BaseModel):
     pdf_url: str
     filename: str
     repository_url: str
+async def call_gemini_api(yaml_content: str) -> str:
+    """
+    Gemini APIを呼び出してYAMLコンテンツからHTMLを生成
+    """
+    try:
+        # 環境変数から設定を取得
+        gemini_api_key = os.getenv("GEMINI_API_KEY")
+        model_id = os.getenv("MODEL", "gemini-2.5-flash")
+        system_instruction = os.getenv("SYSTEM", "YAMLデータを基にHTMLを生成してください。")
+        if not gemini_api_key:
+            raise HTTPException(status_code=500, detail="GEMINI_API_KEY環境変数が設定されていません")
+        # Gemini APIリクエストボディを構築
+        request_body = {
+            "contents": [
+                {
+                    "role": "user",
+                    "parts": [
+                        {
+                            "text": system_instruction
+                        },
+                    ]
+                },
+                {
+                    "role": "model",
+                    "parts": [
+                        {
+                            "text": "はい、システムインストラクションとして認識しました。\nYAMLデータを基にHTMLを生成いたします。"
+                        },
+                    ]
+                },
+                {
+                    "role": "user",
+                    "parts": [
+                        {
+                            "text": yaml_content
+                        },
+                    ]
+                },
+            ],
+            "generationConfig": {
+                "temperature": 0.75,
+                "responseMimeType": "text/plain",
+            },
+        }
+        # Gemini APIを呼び出し
+        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_id}:generateContent?key={gemini_api_key}"
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.post(url, json=request_body)
+            response.raise_for_status()
+            # レスポンスからHTMLを抽出
+            result = response.json()
+            # レスポンス構造を解析してHTMLを抽出
+            if "candidates" in result and len(result["candidates"]) > 0:
+                candidate = result["candidates"][0]
+                if "content" in candidate and "parts" in candidate["content"]:
+                    parts = candidate["content"]["parts"]
+                    for part in parts:
+                        if "text" in part:
+                            text = part["text"]
+                            # HTMLタグを含む部分を抽出
+                            html_match = re.search(r'<html.*?>.*?</html>', text, re.DOTALL | re.IGNORECASE)
+                            if html_match:
+                                return html_match.group(0)
+                            # HTML全体がない場合はbodyタグを探す
+                            body_match = re.search(r'<body.*?>.*?</body>', text, re.DOTALL | re.IGNORECASE)
+                            if body_match:
+                                return f"<html><head><meta charset='utf-8'></head>{body_match.group(0)}</html>"
+                            # それでもない場合は全体をHTMLとして扱う
+                            if "<" in text and ">" in text:
+                                return text
+            raise HTTPException(status_code=500, detail="Gemini APIレスポンスからHTMLを抽出できませんでした")
+    except httpx.HTTPError as e:
+        raise HTTPException(status_code=500, detail=f"Gemini API呼び出しエラー: {str(e)}")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Gemini API処理中にエラーが発生しました: {str(e)}")
 async def html_to_pdf_api(html_content: str) -> tuple[str, str]:
     """
     HTMLコンテンツをPDFに変換してHugging Faceデータセットリポジトリにアップロード
         "storage": f"Hugging Face Dataset Repository: {hf_repo_id}",
         "endpoints": {
             "convert": "/convert - HTMLをPDFに変換してHFリポジトリに保存",
+            "convert-yaml": "/convert-yaml - YAMLをGemini APIでHTMLに変換してPDF化",
             "files": "/files - HFリポジトリ内のPDFファイル一覧",
             "docs": "/docs - API仕様書",
             "health": "/health - ヘルスチェック"
     return {"status": "healthy", "timestamp": datetime.now().isoformat()}
+@app.post("/convert-yaml", response_model=PDFResponse)
+async def convert_yaml_to_pdf(request: YAMLRequest):
+    """
+    YAMLコンテンツをGemini APIでHTMLに変換し、PDFを生成してHugging Faceデータセットリポジトリに保存するエンドポイント
+    - **yaml_content**: 変換するYAMLコンテンツ
+    Returns:
+    - **pdf_url**: 生成されたPDFのダウンロードURL（Hugging Face上）
+    - **filename**: PDFファイル名
+    - **message**: 処理結果メッセージ
+    - **repository_url**: リポジトリURL
+    """
+    if not request.yaml_content or not request.yaml_content.strip():
+        raise HTTPException(status_code=400, detail="YAMLコンテンツが空です")
+    try:
+        # YAMLからGemini APIでHTMLを生成
+        html_content = await call_gemini_api(request.yaml_content)
+        # 生成されたHTMLをPDFに変換
+        filename, pdf_url = await html_to_pdf_api(html_content)
+        hf_repo_id = os.getenv("HF_DATASET_REPO_ID")
+        repository_url = f"https://huggingface.co/datasets/{hf_repo_id}"
+        return PDFResponse(
+            pdf_url=pdf_url,
+            filename=filename,
+            message=f"✅ YAMLからPDFが正常に生成され、Hugging Faceリポジトリに保存されました: {filename}",
+            repository_url=repository_url
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"処理中にエラーが発生しました: {str(e)}")
 @app.get("/files")
 async def list_files():
     """