Files changed (9) hide show
  1. Dockerfile +0 -26
  2. fine_tune.py +0 -57
  3. index.html +0 -33
  4. main.py +0 -70
  5. pdf_reader.py +0 -47
  6. requirements.txt +0 -8
  7. script.js +0 -88
  8. styles.css +0 -69
  9. summarizer.py +0 -99
Dockerfile DELETED
@@ -1,26 +0,0 @@
# Dockerfile
FROM python:3.10-slim

# System deps for PDF processing (poppler-utils is optional tooling for PDFs)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy and install python deps first so this layer is cached when only
# source files change. torch/transformers/PyPDF2/pdfminer.six are already
# listed in requirements.txt, so no extra pip install step is needed.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV PYTHONUNBUFFERED=1
# The fine-tuned model name can be set here:
# ENV FINE_TUNED_MODEL=username/bart-finetuned-arxiv

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fine_tune.py DELETED
@@ -1,57 +0,0 @@
# fine_tune.py
"""Fine-tune facebook/bart-large-cnn on an arXiv summarization dataset and push it to the Hub."""
import os

from datasets import load_dataset
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

dataset = load_dataset("marcov/scientific_papers_arxiv_promptsource")

# Small subset for quick day-to-day test runs
dataset["train"] = dataset["train"].select(range(1000))
dataset["validation"] = dataset["validation"].select(range(200))

max_input_length = 1024
max_output_length = 200

def preprocess_function(batch):
    """Tokenize articles as inputs and summaries as labels, truncating both.

    NOTE(review): assumes the dataset exposes "article"/"summary" columns —
    confirm against the actual dataset schema.
    """
    inputs = tokenizer(batch["article"], max_length=max_input_length, truncation=True)
    labels = tokenizer(batch["summary"], max_length=max_output_length, truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = labels["input_ids"]
    return batch

tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)

# The map above produces ragged (unpadded) sequences; Trainer's default
# collator cannot batch them. DataCollatorForSeq2Seq pads input_ids,
# attention_mask and labels dynamically per batch (labels padded with -100).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./bart-finetuned-arxiv-hub",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,  # set True when a GPU is available
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=True,
    # The target Hub repo belongs here, not in the push_to_hub() call below.
    hub_model_id="username/bart-finetuned-arxiv",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# The first positional argument of Trainer.push_to_hub is the commit message;
# the repo id is taken from hub_model_id in TrainingArguments.
trainer.push_to_hub("End of fine-tuning")
print("Fine-tuning complete.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
index.html DELETED
@@ -1,33 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>Scientific Article Summarizer</title>
  <link rel="stylesheet" href="/static/styles.css" />
</head>
<body>
  <h1>Scientific Article Summarizer</h1>

  <div class="container">
    <!-- Plain-text summarization -->
    <h2>Summarize Text</h2>
    <textarea id="inputText" placeholder="Insert scientific text..."></textarea>
    <div class="row">
      <button id="summTextBtn" onclick="summarizeText()">Summarize Text</button>
      <button onclick="copySummary()">Copy Summary</button>
    </div>

    <!-- PDF upload summarization -->
    <h2>Summarize PDF</h2>
    <input type="file" id="pdfFile" accept="application/pdf" />
    <div class="row">
      <button id="summPdfBtn" onclick="summarizePDF()">Summarize PDF</button>
    </div>

    <!-- Result and error display -->
    <h2>Summary Result</h2>
    <div id="summaryBox" style="white-space: pre-line;"></div>

    <div id="errorBox" class="error" hidden></div>
  </div>

  <script src="/static/script.js"></script>
</body>
</html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,70 +0,0 @@
# main.py
"""FastAPI app exposing text and PDF summarization endpoints plus the static frontend."""
import os
import logging
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.status import HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR

from summarizer import generate_summary
from pdf_reader import extract_text_from_pdf

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("summarizer_app")

app = FastAPI(title="Scientific Article Summarizer")

# CORS (frontend ↔ backend)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Static files are served from the project root (index.html references
# /static/styles.css and /static/script.js at the root).
# NOTE(review): this exposes every repo file — including source code — under
# /static/; consider moving frontend assets into a dedicated static/ folder.
app.mount("/static", StaticFiles(directory="."), name="static")

@app.get("/")
async def root():
    """Serve the single-page frontend."""
    return FileResponse("index.html")

@app.post("/summarize/text")
async def summarize_text(data: dict):
    """Summarize raw text posted as JSON: {"text": "..."}.

    Raises 400 when no text is provided and 500 on summarization failure.
    """
    text = data.get("text", "")
    if not text or not text.strip():
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="No text provided.")
    try:
        summary = generate_summary(text)
        return {"summary": summary}
    except Exception as e:
        logger.exception("Error while generating text summary")
        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

MAX_PDF_BYTES = int(os.getenv("MAX_PDF_BYTES", 10 * 1024 * 1024))  # 10 MB default

@app.post("/summarize/pdf")
async def summarize_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and summarize it.

    Rejects non-PDF, empty, and oversized uploads with 400.
    """
    # filename can be None on some clients — guard before calling .lower()
    # instead of crashing with AttributeError.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Only PDF files are supported.")
    pdf_bytes = await file.read()
    if len(pdf_bytes) == 0:
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Uploaded PDF is empty.")
    if len(pdf_bytes) > MAX_PDF_BYTES:
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail=f"PDF too large (max {MAX_PDF_BYTES} bytes).")
    try:
        text = extract_text_from_pdf(pdf_bytes)
        if not text or not text.strip():
            return {"summary": "PDF is empty or could not be processed."}
        summary = generate_summary(text)
        return {"summary": summary}
    except Exception as e:
        logger.exception("Error while processing PDF")
        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

# Keep FastAPI's default error shape {"detail": ...}: the frontend reads
# err.detail, so the previous {"error": ...} shape hid every error message.
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"detail": exc.detail})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdf_reader.py DELETED
@@ -1,47 +0,0 @@
# pdf_reader.py
"""Best-effort PDF text extraction with multiple optional backends."""
import io
from typing import Optional

# Support several extraction libraries: PyPDF2 first, pdfminer.six as fallback.
try:
    from PyPDF2 import PdfReader
    _has_pypdf2 = True
except Exception:
    _has_pypdf2 = False

try:
    from pdfminer.high_level import extract_text as pdfminer_extract_text
    _has_pdfminer = True
except Exception:
    _has_pdfminer = False

def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from a PDF given as raw bytes.

    Tries each available backend in turn; returns "" when nothing works.
    """
    # 1) PyPDF2
    if _has_pypdf2:
        try:
            pieces = []
            for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
                extracted = page.extract_text()
                if extracted:
                    pieces.append(extracted + "\n")
            joined = "".join(pieces)
            if joined.strip():
                return joined
        except Exception:
            pass  # fall through to the next backend

    # 2) pdfminer.six (often better on complex layouts)
    if _has_pdfminer:
        try:
            mined = pdfminer_extract_text(io.BytesIO(pdf_bytes))
            if mined and mined.strip():
                return mined
        except Exception:
            pass

    # Nothing succeeded — return an empty string.
    return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- fastapi
2
- uvicorn[standard]
3
- transformers
4
- torch
5
- PyPDF2
6
- python-multipart
7
- pdfminer.six
8
-
 
 
 
 
 
 
 
 
 
script.js DELETED
@@ -1,88 +0,0 @@
// script.js
// Frontend glue for the summarizer: posts text/PDFs to the backend and
// renders the summary or an error message.

// POST the textarea contents to /summarize/text and show the summary.
async function summarizeText() {
  clearMessages();
  const text = document.getElementById("inputText").value;
  const btn = document.getElementById("summTextBtn");
  btn.disabled = true;
  btn.innerText = "Summarizing...";
  try {
    const response = await fetch("/summarize/text", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify({ text })
    });
    if (!response.ok) {
      const err = await response.json();
      // Accept both FastAPI's default {"detail": ...} and a custom
      // {"error": ...} handler shape, so messages are never swallowed.
      showError(err.detail || err.error || "Error during summarization.");
      return;
    }
    const data = await response.json();
    document.getElementById("summaryBox").innerText = data.summary;
  } catch (e) {
    showError(e.message || "Network error.");
  } finally {
    btn.disabled = false;
    btn.innerText = "Summarize Text";
  }
}

// Upload the selected PDF to /summarize/pdf and show the summary.
async function summarizePDF() {
  clearMessages();
  const fileInput = document.getElementById("pdfFile");
  if (!fileInput.files.length) {
    showError("Please select a PDF file first.");
    return;
  }
  const file = fileInput.files[0];
  const formData = new FormData();
  formData.append("file", file);

  const btn = document.getElementById("summPdfBtn");
  btn.disabled = true;
  btn.innerText = "Summarizing...";

  try {
    const response = await fetch("/summarize/pdf", {
      method: "POST",
      body: formData
    });
    if (!response.ok) {
      const err = await response.json();
      // Same dual-shape handling as summarizeText().
      showError(err.detail || err.error || "Error during PDF summarization.");
      return;
    }
    const data = await response.json();
    document.getElementById("summaryBox").innerText = data.summary;
  } catch (e) {
    showError(e.message || "Network error.");
  } finally {
    btn.disabled = false;
    btn.innerText = "Summarize PDF";
  }
}

// Show an error message in the dedicated error box.
function showError(msg) {
  const errBox = document.getElementById("errorBox");
  errBox.hidden = false;
  errBox.innerText = msg;
}

// Reset both the summary and error displays before a new request.
function clearMessages() {
  document.getElementById("summaryBox").innerText = "";
  const errBox = document.getElementById("errorBox");
  errBox.hidden = true;
  errBox.innerText = "";
}

// Copy the current summary to the clipboard, alerting on success/failure.
function copySummary() {
  const text = document.getElementById("summaryBox").innerText;
  if (!text) {
    alert("There's no summary to copy.");
    return;
  }
  navigator.clipboard.writeText(text).then(() => {
    alert("Summary copied to clipboard.");
  }).catch(() => {
    alert("Failed to copy summary.");
  });
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
styles.css DELETED
@@ -1,69 +0,0 @@
/* styles.css — layout and theming for the summarizer frontend */

/* Page background and base typography */
body {
  padding: 20px;
  background: #f1f1f1;
  font-family: Arial, sans-serif;
}

h1 {
  text-align: center;
}

/* Centered white card that holds all controls */
.container {
  margin: auto;
  max-width: 900px;
  padding: 20px;
  background: white;
  border-radius: 10px;
  box-shadow: 0 4px 12px rgba(0,0,0,0.06);
}

/* Input area for pasted article text */
textarea {
  width: 100%;
  height: 180px;
  padding: 10px;
  border: 1px solid #ddd;
  border-radius: 6px;
  resize: vertical;
}

/* Primary action buttons */
button {
  margin-top: 10px;
  padding: 10px 20px;
  border: none;
  border-radius: 6px;
  background: #2f6fed;
  color: white;
  font-weight: 600;
  cursor: pointer;
}

/* Dim buttons while a request is in flight */
button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}

/* Horizontal button group */
.row {
  display: flex;
  align-items: center;
  gap: 10px;
  margin-top: 8px;
}

/* Summary output panel */
#summaryBox {
  margin-top: 10px;
  min-height: 120px;
  padding: 15px;
  background: #fafafa;
  border: 1px solid #eee;
  border-radius: 5px;
}

/* Error banner (hidden until an error occurs) */
.error {
  margin-top: 12px;
  padding: 10px;
  color: #a94442;
  background: #f2dede;
  border: 1px solid #ebccd1;
  border-radius: 6px;
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
summarizer.py DELETED
@@ -1,99 +0,0 @@
# summarizer.py
"""BART-based abstractive summarizer with token-level chunking for long inputs."""
import os
import math
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# Configuration: fine-tuned model name from the environment, or the default.
MODEL_NAME = os.getenv("FINE_TUNED_MODEL", "facebook/bart-large-cnn")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialization (once, at import time)
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Generation parameters
MAX_INPUT_LENGTH = 1024
SUMMARY_MIN_LENGTH = 40
SUMMARY_MAX_LENGTH = 200
NUM_BEAMS = 4

def chunk_text(text: str, max_tokens: int = MAX_INPUT_LENGTH, overlap: int = 50):
    """Split a long text into token-bounded chunks.

    `overlap` is the number of tokens repeated between consecutive chunks so
    context is not lost at chunk boundaries.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=False)
    input_ids = inputs["input_ids"][0].tolist()
    chunks = []
    start = 0
    while start < len(input_ids):
        end = start + max_tokens
        # Renamed from `chunk_text` — the original local shadowed this function.
        piece = tokenizer.decode(input_ids[start:end], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        chunks.append(piece)
        if end >= len(input_ids):
            break
        start = end - overlap
    return chunks

def _summarize_once(text: str, min_length: int) -> str:
    """Run a single beam-search generate() over `text` (truncated to MAX_INPUT_LENGTH)."""
    inputs = tokenizer([text], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask", None),
            num_beams=NUM_BEAMS,
            min_length=min_length,
            max_length=SUMMARY_MAX_LENGTH,
            early_stopping=True,
            no_repeat_ngram_size=3
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

def generate_summary(text: str) -> str:
    """Summarize `text`.

    Short inputs get a single model call; inputs longer than MAX_INPUT_LENGTH
    tokens are chunked, each chunk summarized, and the partial summaries
    re-summarized into one unified result. Returns "" for empty input.
    """
    text = text.strip()
    if not text:
        return ""

    # Short text: summarize directly.
    # (A dead `tokenizer(text, max_length=1, truncation=False)` call was
    # removed here — its result was never used.)
    if len(tokenizer.encode(text)) <= MAX_INPUT_LENGTH:
        return _summarize_once(text, SUMMARY_MIN_LENGTH)

    # Long text: summarize each chunk, then aggregate.
    chunks = chunk_text(text, max_tokens=MAX_INPUT_LENGTH, overlap=64)
    partial_summaries = [_summarize_once(chunk, SUMMARY_MIN_LENGTH // 2) for chunk in chunks]

    # Combine the partial summaries and condense them into the final summary
    # (the combined text is truncated to MAX_INPUT_LENGTH if still too long).
    combined = "\n\n".join(partial_summaries)
    return _summarize_once(combined, SUMMARY_MIN_LENGTH)