Spaces:

rustemgareev
/

mdeberta-ner-ontonotes5

Sleeping

App Files Files Community

rustemgareev committed on Jan 13

Commit

eb59cf9

1 Parent(s): f3e233f

Upload app files

Browse files

Files changed (4) hide show

Dockerfile +20 -0
app.py +282 -0
requirements.txt +6 -0
static/index.html +301 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Point the Hugging Face cache at /app/cache, which is made world-writable
+# below so the unprivileged runtime user can download the model into it.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    TRANSFORMERS_CACHE=/app/cache \
+    HF_HOME=/app/cache
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+# Install dependencies before copying the app so this layer is reused when
+# only the application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY app.py .
+# app.py mounts StaticFiles(directory="static"); the UI assets must therefore
+# be present in the image next to app.py.
+COPY static ./static
+# Run as a non-root user.
+RUN useradd -m -u 1000 user
+USER user
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import logging
+from contextlib import asynccontextmanager
+from typing import List, Dict, Any
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel, Field
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Config
+# Request body accepted by the POST /predict endpoint.
+class NERRequest(BaseModel):
+    # Text to run entity recognition on; required (no default value).
+    text: str = Field(..., title="Input Text", description="Text to analyze")
+# One aggregated entity; mirrors the dict built by save_current_entity().
+class NEREntity(BaseModel):
+    # Entity type with the BIO prefix removed (e.g. "ORG").
+    entity_group: str
+    # Mean of the per-token scores that make up the entity.
+    score: float
+    # Entity text after boundary refinement.
+    word: str
+    # Character offsets of the entity in the submitted text (text[start:end]).
+    start: int
+    end: int
+# Response body returned by the POST /predict endpoint.
+class NERResponse(BaseModel):
+    # Entities found in the request text; empty when nothing was detected
+    # or the text was blank.
+    entities: List[NEREntity]
+# Constants (all measured in tokenizer tokens, not characters)
+# Inputs with at most this many tokens are sent to the pipeline in one call.
+SHORT_TEXT_THRESHOLD = 128
+# Upper bound on tokens per sliding window; also passed to the tokenizer
+# as model_max_length when the model is loaded.
+MODEL_MAX_LENGTH = 512
+# Number of tokens shared by two consecutive sliding windows.
+WINDOW_OVERLAP = 128
+# Core Logic
+def refine_boundaries(text: str, start: int, end: int) -> (int, int, str):
+    """
+    Adjusts start/end indices.
+    1. Expands selection to the end of the word if the model stopped mid-word.
+    2. Trims leading/trailing whitespace.
+    """
+    while end < len(text) and text[end].isalnum():
+        end += 1
+    # while end < len(text) and (text[end].isalnum() or text[end] == '-'):
+    #     end += 1
+    span = text[start:end]
+    # Shift start index forward if there is leading whitespace
+    while span and span[0].isspace():
+        start += 1
+        span = span[1:]
+    # Shift end index backward if there is trailing whitespace
+    while span and span[-1].isspace():
+        end -= 1
+        span = span[:-1]
+    return start, end, span
+def refine_boundaries1(text: str, start: int, end: int) -> (int, int, str):
+    """
+    Adjusts start/end indices to exclude leading/trailing whitespace.
+    This ensures the HTML highlight is tight around the word.
+    """
+    # Extract the raw span using original indices
+    span = text[start:end]
+    # Shift start index forward if there is leading whitespace
+    while span and span[0].isspace():
+        start += 1
+        span = span[1:]
+    # Shift end index backward if there is trailing whitespace
+    while span and span[-1].isspace():
+        end -= 1
+        span = span[:-1]
+    return start, end, span
+def save_current_entity(entity_parts: List[Dict], full_text: str, aggregated_entities: List[Dict]):
+    """
+    Finalizes a group of tokens into a single entity.
+    """
+    if not entity_parts:
+        return
+    # 1. Determine the raw range
+    raw_start = entity_parts[0]['start']
+    raw_end = entity_parts[-1]['end']
+    # 2. Refine boundaries (Trim spaces from indices)
+    final_start, final_end, clean_word = refine_boundaries(full_text, raw_start, raw_end)
+    if not clean_word:
+        return
+    # 3. Calculate score
+    avg_score = sum(part['score'] for part in entity_parts) / len(entity_parts)
+    # 4. Determine label (remove B/I prefix)
+    # We take the label from the first token usually, or the most frequent one
+    raw_label = entity_parts[0]['entity']
+    entity_group = raw_label.split('-')[-1] # e.g., "B-ORG" -> "ORG"
+    aggregated_entities.append({
+        'word': clean_word,
+        'score': float(avg_score),
+        'entity_group': entity_group,
+        'start': final_start,
+        'end': final_end
+    })
+def aggregate_entities_manual(ner_results: List[Dict], full_text: str) -> List[Dict]:
+    """
+    Aggregates subword tokens into whole entities.
+    Handles SentencePiece artifacts and BIO tagging.
+    """
+    if not ner_results:
+        return []
+    aggregated_entities = []
+    current_entity_parts = []
+    for entity in ner_results:
+        entity_label = entity['entity']
+        # Skip 'O' (Outside)
+        if entity_label == 'O':
+            if current_entity_parts:
+                save_current_entity(current_entity_parts, full_text, aggregated_entities)
+                current_entity_parts = []
+            continue
+        # Parse Label (e.g., "B-ORG", "I-ORG")
+        if '-' in entity_label:
+            prefix, label_type = entity_label.split('-', 1)
+        else:
+            prefix, label_type = None, entity_label
+        # Decision logic for merging
+        if not current_entity_parts:
+            # Start new entity
+            current_entity_parts.append(entity)
+        else:
+            prev_label = current_entity_parts[-1]['entity']
+            prev_type = prev_label.split('-')[-1] if '-' in prev_label else prev_label
+            # Merge condition:
+            # 1. Same Entity Type (ORG == ORG)
+            # 2. Adjacent indices (current start == prev end)
+            # 3. Logic: If it's "I-" tag, it MUST merge. If it's "B-" tag, it usually starts new,
+            #    BUT some models are messy. We prioritize adjacency + type match for smoother highlighting.
+            if label_type == prev_type and entity['start'] == current_entity_parts[-1]['end']:
+                current_entity_parts.append(entity)
+            else:
+                # Close previous and start new
+                save_current_entity(current_entity_parts, full_text, aggregated_entities)
+                current_entity_parts = [entity]
+    # Save tail
+    if current_entity_parts:
+        save_current_entity(current_entity_parts, full_text, aggregated_entities)
+    return aggregated_entities
+# Smart Processing Logic
+def process_text_smart(text: str, pipe, tokenizer) -> List[Dict]:
+    """
+    Hybrid strategy: Direct inference for short texts, Sliding Window for long ones.
+    Returns RAW tokens (unaggregated).
+    """
+    tokenized = tokenizer(
+        text,
+        return_offsets_mapping=True,
+        add_special_tokens=False,
+        verbose=False
+    )
+    offsets = tokenized["offset_mapping"]
+    total_tokens = len(offsets)
+    # STRATEGY A: Short Text
+    if total_tokens <= SHORT_TEXT_THRESHOLD:
+        return pipe(text)
+    # STRATEGY B: Sliding Window
+    all_raw_tokens = []
+    step = MODEL_MAX_LENGTH - WINDOW_OVERLAP
+    for start_idx in range(0, total_tokens, step):
+        end_idx = min(start_idx + MODEL_MAX_LENGTH, total_tokens)
+        char_start = offsets[start_idx][0]
+        char_end = offsets[end_idx - 1][1]
+        chunk_text = text[char_start:char_end]
+        if not chunk_text.strip():
+            continue
+        chunk_results = pipe(chunk_text)
+        for ent in chunk_results:
+            ent["start"] += char_start
+            ent["end"] += char_start
+            all_raw_tokens.append(ent)
+        if end_idx == total_tokens:
+            break
+    # Deduplicate raw tokens based on start index
+    all_raw_tokens.sort(key=lambda x: x['start'])
+    unique_tokens = []
+    seen_indices = set()
+    for t in all_raw_tokens:
+        idx_key = (t['start'], t['end'])
+        if idx_key not in seen_indices:
+            unique_tokens.append(t)
+            seen_indices.add(idx_key)
+    return unique_tokens
+# Lifespan
+ml_models: Dict[str, Any] = {}
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    model_name = "rustemgareev/mdeberta-ner-ontonotes5"
+    logger.info(f"Loading model: {model_name}...")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=MODEL_MAX_LENGTH)
+        model = AutoModelForTokenClassification.from_pretrained(model_name)
+        ner_pipe = pipeline(
+            "ner",
+            model=model,
+            tokenizer=tokenizer,
+            aggregation_strategy="none",
+            device=-1
+        )
+        ml_models["ner"] = ner_pipe
+        ml_models["tokenizer"] = tokenizer
+        logger.info("Model loaded.")
+    except Exception as e:
+        logger.error(f"CRITICAL ERROR loading model: {e}")
+    yield
+    ml_models.clear()
+# App Init
+# The lifespan handler loads the model before the app starts serving
+# and clears it on shutdown.
+app = FastAPI(title="mDeBERTa NER API", version="3.3.0", lifespan=lifespan)
+# API Endpoints
+@app.post("/predict", response_model=NERResponse)
+def predict(request: NERRequest):
+    if "ner" not in ml_models:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+    if not request.text.strip():
+        return NERResponse(entities=[])
+    try:
+        # 1. Get Raw Tokens
+        raw_tokens = process_text_smart(
+            request.text,
+            ml_models["ner"],
+            ml_models["tokenizer"]
+        )
+        # 2. Aggressive Aggregation & Boundary Refinement
+        # We pass request.text to allow precise index trimming
+        aggregated = aggregate_entities_manual(raw_tokens, request.text)
+        return NERResponse(entities=[NEREntity(**item) for item in aggregated])
+    except Exception as e:
+        logger.error(f"Prediction error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# Static Files
+# Serves the contents of ./static at the site root. Registered after the
+# /predict route; NOTE(review): presumably required so this catch-all mount
+# at "/" does not shadow the API route — confirm before reordering.
+app.mount("/", StaticFiles(directory="static", html=True), name="static")

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+fastapi==0.128.0
+pydantic==2.12.5
+torch==2.9.1
+transformers==4.57.3
+uvicorn==0.40.0
+aiofiles

static/index.html ADDED Viewed

	@@ -0,0 +1,301 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>mdeberta-ner-ontonotes5</title>
+    <meta name="description" content="Named Entity Recognition Demo">
+    <style>
+        :root {
+            --c-misc: #f3e5f5;
+            --t-misc: #4a148c;
+        }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 40px 20px;
+            background-color: #fafafa;
+            color: #111;
+            line-height: 1.6;
+        }
+        .container {
+            background: white;
+            padding: 40px;
+            border-radius: 12px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.04);
+            border: 1px solid #eaeaea;
+        }
+        header {
+            margin-bottom: 30px;
+        }
+        h1 {
+            font-size: 24px;
+            font-weight: 600;
+            margin: 0 0 8px 0;
+            letter-spacing: -0.02em;
+        }
+        .subtitle {
+            color: #666;
+            font-size: 14px;
+        }
+        /* Input Section */
+        .input-group {
+            margin-bottom: 20px;
+            position: relative;
+        }
+        label {
+            display: block;
+            font-size: 12px;
+            font-weight: 600;
+            text-transform: uppercase;
+            color: #888;
+            margin-bottom: 8px;
+            letter-spacing: 0.05em;
+        }
+        textarea {
+            width: 100%;
+            min-height: 120px;
+            padding: 16px;
+            border: 1px solid #ddd;
+            border-radius: 8px;
+            font-size: 16px;
+            line-height: 1.6;
+            resize: vertical;
+            background-color: #fff;
+            box-sizing: border-box;
+            font-family: inherit;
+            outline: none;
+            transition: border-color 0.2s, box-shadow 0.2s;
+            color: #333;
+        }
+        textarea:focus {
+            border-color: #000;
+            box-shadow: 0 0 0 2px rgba(0,0,0,0.05);
+        }
+        /* Button */
+        button.main-btn {
+            background-color: #1a1a1a;
+            color: white;
+            border: none;
+            padding: 14px 32px;
+            border-radius: 8px;
+            font-family: inherit;
+            font-size: 15px;
+            font-weight: 500;
+            cursor: pointer;
+            display: block;
+            width: 100%;
+            transition: background-color 0.2s, transform 0.1s;
+        }
+        button.main-btn:hover {
+            background-color: #333;
+        }
+        button.main-btn:active {
+            transform: scale(0.99);
+        }
+        button.main-btn:disabled {
+            background-color: #ccc;
+            cursor: not-allowed;
+            transform: none;
+        }
+        /* Output Section */
+        #output-wrapper {
+            margin-top: 30px;
+            display: none;
+            animation: fadeIn 0.3s ease-out;
+        }
+        .result-box {
+            padding: 20px;
+            border: 1px solid #eee;
+            background-color: #fcfcfc;
+            border-radius: 8px;
+            font-size: 16px;
+            line-height: 1.8;
+            white-space: pre-wrap;
+        }
+        /* Entity Styling */
+        .entity {
+            padding: 2px 6px;
+            border-radius: 4px;
+            font-weight: 500;
+            cursor: help;
+            position: relative;
+            transition: background-color 0.2s;
+            box-decoration-break: clone;
+            -webkit-box-decoration-break: clone;
+        }
+        /* Tooltip */
+        .entity::after {
+            content: attr(data-label) " " attr(data-score);
+            position: absolute;
+            bottom: 100%;
+            left: 50%;
+            transform: translateX(-50%) translateY(-4px);
+            background: #1a1a1a;
+            color: #fff;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 11px;
+            white-space: nowrap;
+            opacity: 0;
+            pointer-events: none;
+            transition: opacity 0.2s, transform 0.2s;
+            z-index: 10;
+            font-weight: 400;
+        }
+        .entity:hover::after {
+            opacity: 1;
+            transform: translateX(-50%) translateY(-8px);
+        }
+        .type-DEFAULT { background: var(--c-misc); color: var(--t-misc); }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .error-msg {
+            color: #d32f2f;
+            background: #ffebee;
+            padding: 12px;
+            border-radius: 6px;
+            margin-top: 20px;
+            font-size: 14px;
+            display: none;
+        }
+        @media (max-width: 600px) {
+            body { padding: 20px 10px; }
+            .container { padding: 24px; }
+        }
+    </style>
+</head>
+<body>
+<div class="container">
+    <header>
+        <h1>mdeberta-ner-ontonotes5</h1>
+        <div class="subtitle">Named Entity Recognition Demo</div>
+    </header>
+    <div class="input-group">
+        <label for="inputText">Input Text</label>
+        <textarea id="inputText" placeholder="Enter text to analyze...">Apple Inc. is looking at buying a U.K. startup for $1 billion in London next week.</textarea>
+    </div>
+    <button id="analyzeBtn" class="main-btn" onclick="analyze()">Analyze Text</button>
+    <div id="errorBox" class="error-msg"></div>
+    <div id="output-wrapper">
+        <label>Result</label>
+        <div id="resultBox" class="result-box"></div>
+    </div>
+</div>
+<script>
+    async function analyze() {
+        const input = document.getElementById('inputText');
+        const btn = document.getElementById('analyzeBtn');
+        const outputWrapper = document.getElementById('output-wrapper');
+        const resultBox = document.getElementById('resultBox');
+        const errorBox = document.getElementById('errorBox');
+        const text = input.value.trim();
+        if (!text) return;
+        // Reset UI
+        btn.disabled = true;
+        btn.textContent = "Processing...";
+        errorBox.style.display = 'none';
+        outputWrapper.style.display = 'none';
+        try {
+            const response = await fetch('/predict', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ text: text })
+            });
+            if (!response.ok) {
+                throw new Error(`Server Error: ${response.statusText}`);
+            }
+            const data = await response.json();
+            renderResult(text, data.entities);
+            // Show result
+            outputWrapper.style.display = 'block';
+        } catch (err) {
+            errorBox.textContent = err.message;
+            errorBox.style.display = 'block';
+        } finally {
+            btn.disabled = false;
+            btn.textContent = "Analyze Text";
+        }
+    }
+    function renderResult(originalText, entities) {
+        const resultBox = document.getElementById('resultBox');
+        resultBox.innerHTML = '';
+        if (!entities || entities.length === 0) {
+            resultBox.textContent = originalText;
+            return;
+        }
+        let lastIndex = 0;
+        entities.forEach(entity => {
+            // 1. Text before entity
+            const plainText = originalText.slice(lastIndex, entity.start);
+            resultBox.appendChild(document.createTextNode(plainText));
+            // 2. Entity
+            const span = document.createElement('span');
+            // Determine class based on entity group (fallback to DEFAULT)
+            const type = entity.entity_group || 'DEFAULT';
+            // Check if specific class exists in CSS is hard in JS, so we rely on CSS fallbacks or generic logic
+            // Here we map common groups to classes, others fall to DEFAULT via CSS if not defined
+            span.className = `entity type-${type} type-DEFAULT`;
+            span.textContent = originalText.slice(entity.start, entity.end);
+            // Tooltip data
+            span.setAttribute('data-label', type);
+            span.setAttribute('data-score', Math.round(entity.score * 100) + '%');
+            resultBox.appendChild(span);
+            lastIndex = entity.end;
+        });
+        // 3. Remaining text
+        resultBox.appendChild(document.createTextNode(originalText.slice(lastIndex)));
+    }
+</script>
+</body>
+</html>