Spaces:

slamos
/

bc-test

Paused

App Files Files Community

lamossta committed on Apr 3

Commit

ffcf8df

1 Parent(s): 24c3bcf

api and pages

Browse files

Files changed (11) hide show

app.py +20 -0
pages/config.py +43 -0
pages/home.py +34 -0
src/api/fix_newlines.py +13 -0
src/api/fix_newlines_all_models.py +17 -0
src/api/health.py +24 -0
src/datasets/build_pairs.py +135 -0
src/datasets/create_recipes_dataset.py +96 -0
src/fe_handler.py +26 -0
src/models/inference.py +188 -0
tests/conftest.py +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""Streamlit UI entry point for the Newline Fixer service."""
+import streamlit as st
# Start with the sidebar collapsed; this is the first Streamlit call in the script.
st.set_page_config(initial_sidebar_state="collapsed")

# CSS that hides the element tagged `collapsedControl`
# (presumably the sidebar expand/collapse toggle — confirm against the
# Streamlit version in use, since test ids are not a stable API).
HIDE_COLLAPSED_CONTROL_CSS = """
    <style>
        [data-testid="collapsedControl"] { display: none; }
    </style>
    """
st.markdown(HIDE_COLLAPSED_CONTROL_CSS, unsafe_allow_html=True)

# Register the three pages; Home is the default landing page.
pages = [
    st.Page("pages/home.py", title="Home", default=True),
    st.Page("pages/config.py", title="Config"),
    st.Page("pages/result.py", title="Result"),
]

# position="hidden": no navigation menu is rendered, so moving between pages
# is left to the st.switch_page calls inside the pages themselves.
st.navigation(pages, position="hidden").run()

pages/config.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import streamlit as st
+from src.fe_handler import fix_newlines, fix_newlines_all_models
+from pages.nav import show_stepper
# Step 2 of the UI flow: collect the text and the endpoint, call the backend,
# and hand the response to the result page through st.session_state.
show_stepper("Config")
st.title("Configure request")

if st.button("← Back"):
    st.switch_page("pages/home.py")

# Endpoint label -> frontend handler that performs the HTTP call.
endpoint_handlers = {
    "fix-newlines": fix_newlines,
    "fix-newlines-all-models": fix_newlines_all_models,
}

# Pre-fill with the previously submitted text (if any) so coming back from
# another page does not lose the user's input.
text = st.text_area(
    "Paste your text here:",
    value=st.session_state.get("original_text", ""),
    height=300,
    key="input_text",
)

endpoint = st.radio(
    "Select endpoint:",
    list(endpoint_handlers),
    key="endpoint",
    help="**fix-newlines**: single model (distilbert). "
         "**fix-newlines-all-models**: all models side by side.",
)

if st.button("Submit"):
    if text.strip():
        try:
            # Anything other than "fix-newlines" goes to the all-models call.
            handler = endpoint_handlers.get(endpoint, fix_newlines_all_models)
            response = handler(text)
            st.session_state["original_text"] = text
            st.session_state["selected_endpoint"] = endpoint
            st.session_state["result"] = response
            st.switch_page("pages/result.py")
        except Exception as exc:
            # UI boundary: show the failure instead of crashing the page.
            st.error(f"Request failed: {exc}")
    else:
        st.warning("Please enter some text.")

pages/home.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import streamlit as st
+from src.fe_handler import health
+from pages.nav import show_stepper
# Landing page: describe the service, list the endpoints, show whether the
# backend answers its health check, and link to the config page.
show_stepper("Home")
st.title("Newline Fixer")

intro = (
    "An ML service for fixing newline placement in English text. "
    "Paste your text, pick an endpoint, and get properly formatted output."
)
st.write(intro)

st.subheader("Available endpoints")
endpoint_descriptions = (
    "**`/fix-newlines`** — Runs your text through a single model (distilbert). "
    "Returns the fixed text with corrected newline placement.",
    "**`/fix-newlines-all-models`** — Runs your text through all available models "
    "(distilbert, bert, deberta) and returns the results from each, "
    "so you can compare their outputs side by side.",
)
for description in endpoint_descriptions:
    st.markdown(description)

try:
    status = health()
    model_list = ", ".join(status["available_models"])
    st.success(f"API is running. Available models: {model_list}")
except Exception:
    # Any failure (connection, HTTP status, unexpected payload) is reported
    # to the user as the API being unreachable.
    st.error("API is not reachable. Make sure the server is running on localhost:8000.")

if st.button("Next →"):
    st.switch_page("pages/config.py")

src/api/fix_newlines.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from fastapi import APIRouter, HTTPException, Request
+from src.schemas.requests import FixNewlinesRequest
+from src.schemas.responses import FixNewlinesResponse
router = APIRouter()


@router.post("/fix-newlines", response_model=FixNewlinesResponse)
def fix_newlines(request: Request, body: FixNewlinesRequest):
    """Fix newline placement in ``body.text`` using the single-model pipeline.

    The pipeline is read from ``request.app.state.one_model_pipeline``.

    Returns:
        A ``FixNewlinesResponse`` with the pipeline's output and the name of
        the model that produced it.

    Raises:
        HTTPException: 503 when the single-model pipeline is missing or
            ``None``. The health endpoint already treats that pipeline as
            optional, so this reports "not ready" rather than failing with an
            unhandled ``AttributeError`` (HTTP 500).
    """
    pipeline = getattr(request.app.state, "one_model_pipeline", None)
    if pipeline is None:
        raise HTTPException(
            status_code=503,
            detail="Single-model pipeline is not loaded.",
        )
    fixed = pipeline.predict(body.text)
    return FixNewlinesResponse(fixed_text=fixed, model_used=pipeline.model_name)

src/api/fix_newlines_all_models.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from fastapi import APIRouter, HTTPException, Request
+from src.schemas.requests import FixNewlinesAllModelsRequest
+from src.schemas.responses import FixNewlinesAllModelsResponse, ModelResult
router = APIRouter()


@router.post("/fix-newlines-all-models", response_model=FixNewlinesAllModelsResponse)
def fix_newlines_all_models(request: Request, body: FixNewlinesAllModelsRequest):
    """Run ``body.text`` through the all-models pipeline.

    The pipeline is read from ``request.app.state.all_models_pipeline``; its
    ``predict`` result is iterated with ``.items()`` as model name -> fixed
    text, and each entry becomes one ``ModelResult`` in the response.
    """
    per_model_output = request.app.state.all_models_pipeline.predict(body.text)
    results = []
    for model_name, fixed_text in per_model_output.items():
        results.append(ModelResult(model_name=model_name, fixed_text=fixed_text))
    return FixNewlinesAllModelsResponse(results=results)

src/api/health.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from fastapi import APIRouter, Request
+from src.schemas.responses import HealthResponse
router = APIRouter()


@router.get("/health", response_model=HealthResponse)
def health(request: Request):
    """Report service status and the names of the models currently available.

    Model names are gathered from ``app.state.one_model_pipeline.model_name``
    and ``app.state.all_models_pipeline.model_names``. A pipeline that is
    missing or ``None`` is skipped, so the health check still answers when
    only part of the service is loaded.

    Returns:
        A ``HealthResponse`` with ``status="ok"`` and the model names,
        de-duplicated with first-seen order preserved.
    """
    state = request.app.state
    models = []

    single = getattr(state, "one_model_pipeline", None)
    if single is not None:
        models.append(single.model_name)

    # Same guard as for the single-model pipeline, for consistency.
    multi = getattr(state, "all_models_pipeline", None)
    if multi is not None:
        models.extend(multi.model_names)

    # dict.fromkeys keeps the first occurrence of each name, in order.
    return HealthResponse(
        status="ok",
        available_models=list(dict.fromkeys(models)),
    )

src/datasets/build_pairs.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Build sentence pairs from sentence-labeled JSONL files.
+For each document with sentences [s1, s2, s3, ...] and labels [l1, l2, l3, ...],
+produce pairs: (s1, s2, l1), (s2, s3, l2), ..., (s_{n-1}, s_n, l_{n-1}).
+The label describes the boundary between the two sentences in each pair.
+"""
+import json
+from pathlib import Path
+from tqdm import tqdm
def _build_pairs_from_records(
    records: list[dict],
    id_field: str,
    desc: str,
) -> list[dict]:
    """Turn sentence-level records into adjacent-sentence pair records.

    Each record must provide ``"sentences"`` and ``"labels"``. For every pair
    of neighbouring sentences one output dict is produced, holding the
    record's ``id_field`` value (``""`` when absent), ``sentence1``,
    ``sentence2`` and the label at the index of the first sentence.
    ``desc`` is the progress-bar caption.
    """
    pairs: list[dict] = []
    for record in tqdm(records, desc=desc):
        sentences = record["sentences"]
        labels = record["labels"]
        doc_id = record.get(id_field, "")
        neighbours = zip(sentences, sentences[1:])
        pairs.extend(
            {
                id_field: doc_id,
                "sentence1": first,
                "sentence2": second,
                # labels[idx] describes the boundary after sentence idx.
                "label": labels[idx],
            }
            for idx, (first, second) in enumerate(neighbours)
        )
    return pairs
+def _load_jsonl(path: Path) -> list[dict]:
+    records = []
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                records.append(json.loads(line))
+    return records
+def _write_jsonl(pairs: list[dict], path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        for pair in pairs:
+            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
+    print(f"Wrote {len(pairs):,} pairs → {path}")
def build_pubmed_pairs(
    input_path: str | Path = "data/pubmed/pubmed_sentences.jsonl",
    output_path: str | Path = "data/pubmed/pubmed_pairs.jsonl",
) -> None:
    """Read PubMed sentence records and write their sentence pairs as JSONL.

    Each pair carries the source record's ``document_idx``.
    """
    sentence_records = _load_jsonl(Path(input_path))
    _write_jsonl(
        _build_pairs_from_records(sentence_records, "document_idx", "Building PubMed pairs"),
        Path(output_path),
    )
def build_wikipedia_pairs(
    input_path: str | Path = "data/wikipedia/wikipedia_sentences.jsonl",
    output_path: str | Path = "data/wikipedia/wikipedia_pairs.jsonl",
) -> None:
    """Read Wikipedia sentence records and write their sentence pairs as JSONL.

    Each pair carries the source record's ``document_idx``.
    """
    sentence_records = _load_jsonl(Path(input_path))
    _write_jsonl(
        _build_pairs_from_records(sentence_records, "document_idx", "Building Wikipedia pairs"),
        Path(output_path),
    )
def build_gutenberg_pairs(
    input_path: str | Path = "data/gutenberg/gutenberg_sentences.jsonl",
    output_path: str | Path = "data/gutenberg/gutenberg_pairs.jsonl",
) -> None:
    """Read Gutenberg sentence records and write their sentence pairs as JSONL.

    Unlike the other datasets, pairs are tagged with the record's
    ``file_name`` rather than ``document_idx``.
    """
    sentence_records = _load_jsonl(Path(input_path))
    _write_jsonl(
        _build_pairs_from_records(sentence_records, "file_name", "Building Gutenberg pairs"),
        Path(output_path),
    )
def build_recipes_pairs(
    input_path: str | Path = "data/recipes/recipes_sentences.jsonl",
    output_path: str | Path = "data/recipes/recipes_pairs.jsonl",
) -> None:
    """Read recipe sentence records and write their sentence pairs as JSONL.

    Each pair carries the source record's ``document_idx``.
    """
    sentence_records = _load_jsonl(Path(input_path))
    _write_jsonl(
        _build_pairs_from_records(sentence_records, "document_idx", "Building recipes pairs"),
        Path(output_path),
    )
def build_all_pairs() -> None:
    """Build pairs for every dataset, each with its default input/output paths.

    Order: PubMed, Wikipedia, Gutenberg, recipes.
    """
    builders = (
        build_pubmed_pairs,
        build_wikipedia_pairs,
        build_gutenberg_pairs,
        build_recipes_pairs,
    )
    for build in builders:
        build()
if __name__ == "__main__":
    import argparse

    # One row per dataset sub-command:
    #   name -> (help text, builder, default --input, default --output)
    # Both the sub-parsers and the dispatch below come from this table, so
    # adding a dataset is a one-line change.
    dataset_commands = {
        "pubmed": (
            "Build PubMed pairs",
            build_pubmed_pairs,
            "data/pubmed/pubmed_sentences.jsonl",
            "data/pubmed/pubmed_pairs.jsonl",
        ),
        "wikipedia": (
            "Build Wikipedia pairs",
            build_wikipedia_pairs,
            "data/wikipedia/wikipedia_sentences.jsonl",
            "data/wikipedia/wikipedia_pairs.jsonl",
        ),
        "gutenberg": (
            "Build Gutenberg pairs",
            build_gutenberg_pairs,
            "data/gutenberg/gutenberg_sentences.jsonl",
            "data/gutenberg/gutenberg_pairs.jsonl",
        ),
        "recipes": (
            "Build recipes pairs",
            build_recipes_pairs,
            "data/recipes/recipes_sentences.jsonl",
            "data/recipes/recipes_pairs.jsonl",
        ),
    }

    parser = argparse.ArgumentParser(description="Build sentence pairs from sentence-labeled JSONL files.")
    sub = parser.add_subparsers(dest="dataset")
    for name, (help_text, _builder, default_input, default_output) in dataset_commands.items():
        dataset_parser = sub.add_parser(name, help=help_text)
        dataset_parser.add_argument("--input", default=default_input)
        dataset_parser.add_argument("--output", default=default_output)
    # "all" takes no path options: every dataset uses its own defaults.
    sub.add_parser("all", help="Build pairs for all datasets")

    args = parser.parse_args()
    if args.dataset == "all":
        build_all_pairs()
    elif args.dataset in dataset_commands:
        builder = dataset_commands[args.dataset][1]
        builder(args.input, args.output)
    else:
        # No sub-command given.
        parser.print_help()

src/datasets/create_recipes_dataset.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""Create a recipes dataset from RecipeNLG full_dataset.csv.
+Randomly samples recipes, formats each as a structured document, and writes
+a JSONL file with fields: document_idx, text.
+Usage:
+    python -m src.datasets.create_recipes_dataset
+    python -m src.datasets.create_recipes_dataset --n_samples 500 --seed 42
+"""
+import argparse
+import ast
+import json
+import random
+from pathlib import Path
+import pandas as pd
def format_recipe_as_document(rec: dict) -> str:
    """Turn a raw recipe CSV row into a formatted plain-text document.

    Layout (lines joined with a single newline)::

        <title>
        <blank line>
        Ingredients:
        - ingredient 1
        - ingredient 2
        <blank line>
        Directions:
        1. step one
        2. step two

    The bullet (``"- "`` or ``"• "``) and the numbering style (``"1. "`` or
    ``"1) "``) are drawn from the module-level ``random`` state, so seed it
    for reproducible output.

    Args:
        rec: Row with ``"title"``, ``"ingredients"`` and ``"directions"``.
            The latter two may be lists of strings or their Python-literal
            string form (parsed with ``ast.literal_eval``).

    Returns:
        The formatted document. Blank ingredients and steps are dropped, and
        the remaining steps are numbered consecutively from 1.
    """
    title = rec["title"].strip()

    ingredients = rec["ingredients"]
    if isinstance(ingredients, str):
        ingredients = ast.literal_eval(ingredients)

    directions = rec["directions"]
    if isinstance(directions, str):
        directions = ast.literal_eval(directions)

    # Keep the draw order (bullet first, then numbering) so seeded runs pick
    # the same styles as before.
    bullet = random.choice(["- ", "• "])
    num_style = random.choice(["dot", "paren"])
    num_suffix = ". " if num_style == "dot" else ") "

    # Strip once, then drop blanks.
    ingredient_items = [item for item in (ing.strip() for ing in ingredients) if item]
    steps = [step for step in (d.strip() for d in directions) if step]

    ing_lines = [f"{bullet}{item}" for item in ingredient_items]
    # Number AFTER filtering: numbering the unfiltered list left gaps
    # (e.g. "1.", "3.") whenever a direction was blank.
    dir_lines = [f"{number}{num_suffix}{step}" for number, step in enumerate(steps, start=1)]

    parts = [
        title,
        "",
        "Ingredients:",
        *ing_lines,
        "",
        "Directions:",
        *dir_lines,
    ]
    return "\n".join(parts)
def main() -> None:
    """CLI entry point: sample recipes from the CSV and write them as JSONL.

    Seeds ``random`` (used by the formatter's style choices) and the pandas
    sampler with ``--seed``, samples at most ``--n_samples`` rows, formats
    each one, and writes ``{"document_idx", "text"}`` records to ``--output``.
    """
    parser = argparse.ArgumentParser(description="Create recipes dataset from RecipeNLG CSV.")
    option_specs = (
        ("--csv_path", str, "data/recipes/full_dataset.csv"),
        ("--output", str, "data/recipes/recipes_data.jsonl"),
        ("--n_samples", int, 100),
        ("--seed", int, 42),
    )
    for flag, value_type, default in option_specs:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    random.seed(args.seed)

    source_csv = Path(args.csv_path)
    destination = Path(args.output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    frame = pd.read_csv(source_csv)
    print(f"Total recipes in CSV: {len(frame):,}")

    # Never ask for more rows than the CSV holds.
    sample_size = min(args.n_samples, len(frame))
    recipes = frame.sample(n=sample_size, random_state=args.seed).to_dict(orient="records")
    print(f"Sampled {len(recipes)} recipes")

    with open(destination, "w", encoding="utf-8") as out_file:
        for doc_idx, rec in enumerate(recipes):
            record = {"document_idx": doc_idx, "text": format_recipe_as_document(rec)}
            out_file.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Wrote {len(recipes)} documents -> {destination}")


if __name__ == "__main__":
    main()

src/fe_handler.py ADDED Viewed

	@@ -0,0 +1,26 @@

+"""Frontend handler — bridges the Streamlit UI with the FastAPI backend."""
+import requests
BASE_URL = "http://localhost:8000"


def fix_newlines(
    text: str,
    model_name: str | None = None,
    *,
    timeout: float | None = 60.0,
) -> dict:
    """POST ``text`` to the backend ``/fix-newlines`` endpoint.

    Args:
        text: Text to send.
        model_name: Added to the payload as ``"model_name"`` when truthy.
        timeout: Seconds to wait for the backend before giving up; ``None``
            waits indefinitely (the previous behaviour). Without a timeout a
            hung backend would block the UI forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx status (via ``raise_for_status``).
    """
    payload = {"text": text}
    if model_name:
        payload["model_name"] = model_name
    resp = requests.post(f"{BASE_URL}/fix-newlines", json=payload, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
def fix_newlines_all_models(text: str, *, timeout: float | None = 120.0) -> dict:
    """POST ``text`` to the backend ``/fix-newlines-all-models`` endpoint.

    Args:
        text: Text to send.
        timeout: Seconds to wait for the backend before giving up; ``None``
            waits indefinitely (the previous behaviour). The default is
            generous because this endpoint runs several models per request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx status (via ``raise_for_status``).
    """
    resp = requests.post(
        f"{BASE_URL}/fix-newlines-all-models",
        json={"text": text},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()
def health(*, timeout: float | None = 5.0) -> dict:
    """GET the backend ``/health`` endpoint and return its JSON body.

    Args:
        timeout: Seconds to wait before giving up; ``None`` waits
            indefinitely (the previous behaviour). Kept short so a page that
            calls this on render is not frozen by an unresponsive backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx status (via ``raise_for_status``).
    """
    resp = requests.get(f"{BASE_URL}/health", timeout=timeout)
    resp.raise_for_status()
    return resp.json()

src/models/inference.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+Interactive CLI for paragraph-boundary inference using ONNX models.
+Downloads pre-trained ONNX models from Hugging Face Hub (if not cached),
+loads SAT-12L for sentence splitting, then enters an interactive loop:
+paste text, get boundary predictions.
+Usage:
+    python -m src.models.inference
+    python -m src.models.inference --model distilbert
+    python -m src.models.inference --model bert
+"""
+import argparse
+import logging
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from transformers import AutoTokenizer
+from src.datasets.combined_pairs_dataset import ID2LABEL
+from src.pipelines.sat_loader import load_sat
+from src.models.export_and_download import HF_MODELS, download_model
# Configured at import time: INFO level with timestamped messages.
logging.basicConfig(
    format="%(asctime)s  %(levelname)s  %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Root folder used when a model is loaded with local=True
# (checkpoints/<model>/best).
LOCAL_CHECKPOINTS = Path("checkpoints")

# Separator placed before the second sentence of a pair, keyed by the
# predicted boundary label.
LABEL_SYMBOLS = {
    "SAME_PARAGRAPH": " ",
    "NEW_PARAGRAPH": "\n\n",
    "NEWLINE": "\n",
}
def _load_onnx_model(model_name: str, local: bool = False):
    """Create the ONNX session and tokenizer for ``model_name``.

    With ``local=True`` the model directory is
    ``checkpoints/<model_name>/best``; otherwise it is whatever
    ``download_model`` returns for the repo id registered in ``HF_MODELS``.

    Returns:
        ``(session, tokenizer, input_names)`` where ``input_names`` are the
        names of the session's declared inputs.

    Raises:
        FileNotFoundError: If the directory contains no ``model.onnx``.
    """
    model_dir = (
        LOCAL_CHECKPOINTS / model_name / "best"
        if local
        else download_model(HF_MODELS[model_name])
    )

    onnx_path = model_dir / "model.onnx"
    if not onnx_path.exists():
        raise FileNotFoundError(f"No model.onnx found in {model_dir}")

    # CUDA is listed first, CPU second.
    execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    session = ort.InferenceSession(str(onnx_path), providers=execution_providers)
    input_names = [model_input.name for model_input in session.get_inputs()]
    tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
    return session, tokenizer, input_names
def _predict_pairs(session, tokenizer, input_names, sentences: list[str], max_length: int = 512) -> list[dict]:
    """Classify the boundary between every pair of adjacent sentences.

    Each pair is tokenized on its own (truncated and padded to
    ``max_length``), run through the ONNX session, and reduced to the most
    probable label.

    Returns:
        One dict per pair with ``sentence1``, ``sentence2``, ``label`` and
        ``confidence`` (probability of the chosen label, rounded to 4
        places). Empty when fewer than two sentences are given.
    """
    results: list[dict] = []
    # Zipping with the shifted list yields nothing for 0 or 1 sentences.
    for first, second in zip(sentences, sentences[1:]):
        enc = tokenizer(
            first,
            second,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="np",
        )
        # Feed only the tensors the model actually declares as inputs.
        feeds = {name: enc[name] for name in input_names if name in enc}
        first_output = session.run(None, feeds)[0]
        # Row 0: the batch holds exactly one pair.
        probs = _softmax(first_output[0])
        best = int(np.argmax(probs))
        results.append({
            "sentence1": first,
            "sentence2": second,
            "label": ID2LABEL[best],
            "confidence": round(float(probs[best]), 4),
        })
    return results
+def _softmax(x: np.ndarray) -> np.ndarray:
+    e = np.exp(x - np.max(x))
+    return e / e.sum()
+def _reconstruct(sentences: list[str], predictions: list[dict]) -> str:
+    """Rebuild text from sentences and predicted boundaries."""
+    if not sentences:
+        return ""
+    parts = [sentences[0]]
+    for i, pred in enumerate(predictions):
+        sep = LABEL_SYMBOLS.get(pred["label"], " ")
+        parts.append(sep + sentences[i + 1])
+    return "".join(parts)
+def _read_multiline() -> str | None:
+    """Read multi-line input until an empty line is entered."""
+    print("Paste your text (empty line to submit, 'quit' to exit):")
+    lines = []
+    while True:
+        try:
+            line = input()
+        except EOFError:
+            return None
+        if line.strip().lower() == "quit":
+            return None
+        if line == "" and lines:
+            break
+        lines.append(line)
+    return "\n".join(lines)
def interactive_loop(model_name: str, max_length: int = 512, local: bool = False) -> None:
    """Load the model and sentence splitter, then serve requests from stdin.

    Each round reads a block of text, splits it into sentences, classifies
    every adjacent-sentence boundary, prints the per-pair predictions and the
    text rebuilt with the predicted separators. The loop ends when the reader
    returns ``None``.
    """
    source = "HuggingFace Hub" if not local else "local checkpoints"
    log.info(f"Loading ONNX model '{model_name}' from {source} ...")
    session, tokenizer, input_names = _load_onnx_model(model_name, local=local)

    log.info("Loading SAT-12L ...")
    sat = load_sat()

    rule = "=" * 60
    print("\n" + rule)
    print(f"  Paragraph Boundary Inference  [{model_name} / ONNX]")
    print(rule + "\n")

    while (text := _read_multiline()) is not None:
        if not text.strip():
            print("(empty input, skipping)\n")
            continue

        # 1. Split into sentences, ignoring the input's own newlines.
        raw_sentences = sat.split(text, split_on_input_newlines=False, strip_whitespace=False)

        # 2. Drop blank fragments, then remove embedded newlines and trim.
        # NOTE(review): newlines are removed without inserting a space, so
        # "foo\nbar" becomes "foobar" — confirm this matches training data.
        sentences = []
        for raw in raw_sentences:
            if raw.strip():
                sentences.append(raw.replace('\n', '').strip())

        print(f"\n--- {len(sentences)} sentence(s) detected ---")
        if len(sentences) < 2:
            only_sentence = sentences[0] if sentences else "(none)"
            print(f"  {only_sentence}")
            print("  (need at least 2 sentences to classify boundaries)\n")
            continue

        # 3. Predict the boundary between each adjacent pair.
        predictions = _predict_pairs(session, tokenizer, input_names, sentences, max_length)

        # 4. Per-pair report (sentences truncated to 80 characters).
        for number, pred in enumerate(predictions, start=1):
            label = pred["label"]
            confidence = pred["confidence"]
            print(f"  [{number}] {label:16s} ({confidence:.2%})")
            print(f"       S1: {pred['sentence1'][:80]}")
            print(f"       S2: {pred['sentence2'][:80]}")

        # 5. Text rebuilt with the predicted separators.
        print("\n--- Reconstructed text ---")
        print(_reconstruct(sentences, predictions))
        print()

    print("Bye.")
def main() -> None:
    """Parse the command-line options and start the interactive loop."""
    parser = argparse.ArgumentParser(description="Interactive paragraph-boundary inference (ONNX).")
    available_models = list(HF_MODELS.keys())
    parser.add_argument(
        "--model",
        choices=available_models,
        default="distilbert",
        help="Which model to use (default: distilbert)",
    )
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument(
        "--local",
        action="store_true",
        help="Load from checkpoints/<model>/best instead of HF Hub",
    )
    options = parser.parse_args()
    interactive_loop(options.model, options.max_length, local=options.local)


if __name__ == "__main__":
    main()

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import pytest
+from fastapi.testclient import TestClient
+from main import app
@pytest.fixture(scope="session")
def client():
    """Session-wide ``TestClient`` for the application.

    The client is used as a context manager, so the app's startup and
    shutdown hooks run once around the whole test session (per the original
    note, startup is where SAT and the ONNX models get loaded).
    """
    with TestClient(app) as test_client:
        yield test_client