rohitkshirsagar19 committed
Commit 301dd5b · verified · 1 Parent(s): ec760d2

Upload 3 files

Files changed (3)
  1. Dockerfile +35 -0
  2. main.py +167 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ # ── PromptZip FastAPI – Hugging Face Docker Space ────────────────────────────
+ # HF Spaces requires the container to listen on port 7860.
+ # The Space runs as a non-root user (UID 1000), so we create one here.
+
+ FROM python:3.11-slim
+
+ # ---------- System deps ----------
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+         build-essential \
+         curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # ---------- Non-root user (matches HF Spaces UID) ----------
+ RUN useradd -m -u 1000 appuser
+ WORKDIR /app
+
+ # ---------- Python deps ----------
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip \
+     && pip install --no-cache-dir -r requirements.txt
+
+ # ---------- Application ----------
+ COPY main.py .
+
+ # Pre-download tiktoken encoding data so first request is instant
+ RUN python -c "import tiktoken; tiktoken.get_encoding('cl100k_base')"
+
+ # Switch to non-root
+ USER appuser
+
+ # ---------- Runtime ----------
+ # HF Spaces exposes exactly port 7860
+ EXPOSE 7860
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
main.py ADDED
@@ -0,0 +1,167 @@
+ from __future__ import annotations
+
+ import logging
+ from typing import Optional
+
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field
+
+ logger = logging.getLogger("promptzip")
+
+ # ── App ───────────────────────────────────────────────────────────────────────
+ app = FastAPI(
+     title="PromptZip API",
+     description="Semantic text compression via LLMlingua. Code and log compression run client-side.",
+     version="0.2.0",
+ )
+
+ # Permissive CORS – required for browser clients calling the HF Space
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=False,  # must be False when allow_origins=["*"]
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # ── Tokenizer (loaded once at startup) ───────────────────────────────────────
+ import tiktoken
+
+ _encoder = tiktoken.get_encoding("cl100k_base")
+ _COST_PER_MILLION: float = 5.00  # USD – GPT-4o standard input rate
+
+
+ def count_tokens(text: str) -> int:
+     """Exact token count via cl100k_base (GPT-4 / GPT-4o)."""
+     return len(_encoder.encode(text))
+
+
+ def estimate_cost(token_count: int) -> float:
+     """USD cost at $5.00 / 1 M tokens."""
+     return round((token_count / 1_000_000) * _COST_PER_MILLION, 6)
+
+
+ # ── LLMlingua (lazy-loaded so startup is never blocked) ──────────────────────
+ _llmlingua_compressor = None
+ _llmlingua_error: Optional[str] = None
+
+ # Aggression → target retention ratio
+ _TEXT_RATIOS = {1: 0.8, 2: 0.6, 3: 0.4}
+
+
+ def _get_llmlingua():
+     """Return a cached PromptCompressor, or raise HTTP 503 if unavailable."""
+     global _llmlingua_compressor, _llmlingua_error
+     if _llmlingua_compressor is not None:
+         return _llmlingua_compressor
+     if _llmlingua_error is not None:
+         raise HTTPException(
+             status_code=503,
+             detail=f"LLMlingua unavailable: {_llmlingua_error}",
+         )
+     try:
+         from llmlingua import PromptCompressor
+         _llmlingua_compressor = PromptCompressor(
+             model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
+             use_llmlingua2=True,
+             device_map="cpu",
+         )
+         logger.info("LLMlingua initialised successfully.")
+         return _llmlingua_compressor
+     except Exception as exc:
+         _llmlingua_error = str(exc)
+         logger.error("LLMlingua init failed: %s", exc)
+         raise HTTPException(
+             status_code=503,
+             detail=f"LLMlingua unavailable: {exc}",
+         )
+
+
+ # ── Compression ───────────────────────────────────────────────────────────────
+
+ def compress_text(text: str, aggression: int) -> str:
+     """Semantic compression via LLMlingua PromptCompressor."""
+     compressor = _get_llmlingua()
+     ratio = _TEXT_RATIOS[aggression]
+     result = compressor.compress_prompt(
+         text,
+         rate=ratio,
+         force_tokens=["\n"],
+         drop_consecutive_whitespace=True,
+     )
+     return result.get("compressed_prompt", text)
+
+
+ # ── Schemas ───────────────────────────────────────────────────────────────────
+
+ class CompressRequest(BaseModel):
+     text: str = Field(..., description="The raw text to compress semantically.")
+     aggression_level: int = Field(
+         2,
+         ge=1,
+         le=3,
+         description="1 = gentle (80% retained), 2 = balanced (60%), 3 = aggressive (40%).",
+     )
+
+
+ class CompressResponse(BaseModel):
+     compressed_text: str
+     original_tokens: int
+     new_tokens: int
+     tokens_saved: int
+     percent_saved: float
+     dollars_saved: float
+     aggression_level: int
+
+
+ # ── Endpoints ─────────────────────────────────────────────────────────────────
+
+ @app.get("/health", tags=["Health"])
+ async def health_check():
+     """Liveness probe – confirms the API is running."""
+     return {"status": "ok", "service": "promptzip-api", "version": "0.2.0"}
+
+
+ @app.post("/api/tokenize", tags=["Tokenizer"])
+ async def tokenize(body: dict):
+     """Count exact tokens for a text payload and return estimated cost."""
+     text = body.get("text", "")
+     tokens = count_tokens(text)
+     return {
+         "token_count": tokens,
+         "estimated_cost_usd": estimate_cost(tokens),
+         "encoding": "cl100k_base",
+         "rate_per_million_usd": _COST_PER_MILLION,
+     }
+
+
+ @app.post("/api/compress", response_model=CompressResponse, tags=["Compress"])
+ async def compress(body: CompressRequest):
+     """
+     Semantically compress **text** using LLMlingua.
+
+     - Code and log compression are handled client-side (regex) in the frontend.
+     - Only `mode=text` is served here.
+     """
+     if not body.text.strip():
+         raise HTTPException(status_code=400, detail="text must not be empty.")
+
+     compressed = compress_text(body.text, body.aggression_level)
+
+     original_tokens = count_tokens(body.text)
+     new_tokens = count_tokens(compressed)
+     saved = original_tokens - new_tokens
+     pct = round((saved / original_tokens) * 100, 2) if original_tokens else 0.0
+
+     return CompressResponse(
+         compressed_text=compressed,
+         original_tokens=original_tokens,
+         new_tokens=new_tokens,
+         tokens_saved=saved,
+         percent_saved=pct,
+         dollars_saved=round(
+             estimate_cost(original_tokens) - estimate_cost(new_tokens), 6
+         ),
+         aggression_level=body.aggression_level,
+     )
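A sketch of how a client might call the two POST endpoints above, using only the Python standard library. The base URL is an assumption for a locally running container; a deployed Space URL would differ. The fields read from the responses mirror CompressResponse and the tokenize handler, and the first /api/compress call will be slow because it triggers the lazy LLMlingua model load.

    # Hypothetical client; adjust BASE_URL for the deployed Space.
    import json
    import urllib.request

    BASE_URL = "http://localhost:7860"  # assumption: local docker run on port 7860

    def post_json(path: str, payload: dict) -> dict:
        req = urllib.request.Request(
            BASE_URL + path,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=300) as resp:
            return json.load(resp)

    # /api/tokenize: exact cl100k_base count plus cost at $5.00 per million tokens,
    # e.g. 12_000 tokens -> 12_000 / 1_000_000 * 5.00 = $0.06.
    print(post_json("/api/tokenize", {"text": "Summarize the quarterly report in three bullet points."}))

    # /api/compress: aggression_level 2 targets ~60% retention (see _TEXT_RATIOS).
    sample = "The quarterly revenue increased due to strong demand across all regions. " * 40
    result = post_json("/api/compress", {"text": sample, "aggression_level": 2})
    print(result["original_tokens"], result["new_tokens"], result["percent_saved"], result["dollars_saved"])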
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi>=0.110.0
+ uvicorn[standard]>=0.29.0
+ pydantic>=2.6.0
+ tiktoken>=0.6.0
+ llmlingua>=0.2.2
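These are lower-bound pins, so the versions resolved at image build time can drift. A quick way to record what actually got installed is to read package metadata at runtime; a minimal sketch, stdlib only:

    # Print the versions actually installed for the five pinned packages.
    from importlib.metadata import PackageNotFoundError, version

    for name in ("fastapi", "uvicorn", "pydantic", "tiktoken", "llmlingua"):
        try:
            print(f"{name}=={version(name)}")
        except PackageNotFoundError:
            print(f"{name}: not installed")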