app #1
by student2222333051 - opened
- Dockerfile +0 -26
- fine_tune.py +0 -57
- index.html +0 -33
- main.py +0 -70
- pdf_reader.py +0 -47
- requirements.txt +0 -8
- script.js +0 -88
- styles.css +0 -69
- summarizer.py +0 -99
Dockerfile
DELETED
@@ -1,26 +0,0 @@
-# Dockerfile
-FROM python:3.10-slim
-
-# System deps for PDF processing (optional)
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
-    poppler-utils \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-COPY . .
-
-# Install python deps
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Extra requirements (sometimes needed)
-RUN pip install --no-cache-dir torch transformers PyPDF2 pdfminer.six
-
-ENV PYTHONUNBUFFERED=1
-# The model name can be set here:
-# ENV FINE_TUNED_MODEL=username/bart-finetuned-arxiv
-
-EXPOSE 7860
-
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
fine_tune.py
DELETED
@@ -1,57 +0,0 @@
-# fine_tune.py
-from datasets import load_dataset
-from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
-import os
-
-model_name = "facebook/bart-large-cnn"
-tokenizer = BartTokenizer.from_pretrained(model_name)
-model = BartForConditionalGeneration.from_pretrained(model_name)
-
-dataset = load_dataset("marcov/scientific_papers_arxiv_promptsource")
-
-# Small subset for quick everyday testing
-dataset["train"] = dataset["train"].select(range(1000))
-dataset["validation"] = dataset["validation"].select(range(200))
-
-max_input_length = 1024
-max_output_length = 200
-
-def preprocess_function(batch):
-    inputs = tokenizer(batch["article"], max_length=max_input_length, truncation=True)
-    outputs = tokenizer(batch["summary"], max_length=max_output_length, truncation=True)
-    batch["input_ids"] = inputs["input_ids"]
-    batch["attention_mask"] = inputs["attention_mask"]
-    batch["labels"] = outputs["input_ids"]
-    return batch
-
-tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
-tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)
-
-training_args = TrainingArguments(
-    output_dir="./bart-finetuned-arxiv-hub",
-    evaluation_strategy="steps",
-    eval_steps=500,
-    save_steps=500,
-    save_total_limit=2,
-    learning_rate=3e-5,
-    per_device_train_batch_size=2,
-    per_device_eval_batch_size=2,
-    num_train_epochs=3,
-    weight_decay=0.01,
-    fp16=False,  # set True if running on a GPU
-    logging_dir="./logs",
-    logging_steps=100,
-    push_to_hub=True
-)
-
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_train,
-    eval_dataset=tokenized_val,
-    tokenizer=tokenizer,
-)
-
-trainer.train()
-trainer.push_to_hub("username/bart-finetuned-arxiv")
-print("Fine-tuning complete.")
index.html
DELETED
@@ -1,33 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-  <meta charset="UTF-8" />
-  <title>Scientific Article Summarizer</title>
-  <link rel="stylesheet" href="/static/styles.css" />
-</head>
-<body>
-  <h1>Scientific Article Summarizer</h1>
-
-  <div class="container">
-    <h2>Summarize Text</h2>
-    <textarea id="inputText" placeholder="Insert scientific text..."></textarea>
-    <div class="row">
-      <button id="summTextBtn" onclick="summarizeText()">Summarize Text</button>
-      <button onclick="copySummary()">Copy Summary</button>
-    </div>
-
-    <h2>Summarize PDF</h2>
-    <input type="file" id="pdfFile" accept="application/pdf" />
-    <div class="row">
-      <button id="summPdfBtn" onclick="summarizePDF()">Summarize PDF</button>
-    </div>
-
-    <h2>Summary Result</h2>
-    <div id="summaryBox" style="white-space: pre-line;"></div>
-
-    <div id="errorBox" class="error" hidden></div>
-  </div>
-
-  <script src="/static/script.js"></script>
-</body>
-</html>
main.py
DELETED
@@ -1,70 +0,0 @@
-# main.py
-import os
-import logging
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse, JSONResponse
-from fastapi.staticfiles import StaticFiles
-from starlette.status import HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR
-
-from summarizer import generate_summary
-from pdf_reader import extract_text_from_pdf
-
-# Logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("summarizer_app")
-
-app = FastAPI(title="Scientific Article Summarizer")
-
-# CORS (frontend ↔ backend)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Static files are served from the project root
-app.mount("/static", StaticFiles(directory="."), name="static")
-
-@app.get("/")
-async def root():
-    return FileResponse("index.html")
-
-@app.post("/summarize/text")
-async def summarize_text(data: dict):
-    text = data.get("text", "")
-    if not text or not text.strip():
-        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="No text provided.")
-    try:
-        summary = generate_summary(text)
-        return {"summary": summary}
-    except Exception as e:
-        logger.exception("Error while generating text summary")
-        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
-
-MAX_PDF_BYTES = int(os.getenv("MAX_PDF_BYTES", 10 * 1024 * 1024))  # 10 MB default
-
-@app.post("/summarize/pdf")
-async def summarize_pdf(file: UploadFile = File(...)):
-    if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Only PDF files are supported.")
-    pdf_bytes = await file.read()
-    if len(pdf_bytes) == 0:
-        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Uploaded PDF is empty.")
-    if len(pdf_bytes) > MAX_PDF_BYTES:
-        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail=f"PDF too large (max {MAX_PDF_BYTES} bytes).")
-    try:
-        text = extract_text_from_pdf(pdf_bytes)
-        if not text or not text.strip():
-            return {"summary": "PDF is empty or could not be processed."}
-        summary = generate_summary(text)
-        return {"summary": summary}
-    except Exception as e:
-        logger.exception("Error while processing PDF")
-        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
-
-# Custom error handler example
-@app.exception_handler(HTTPException)
-async def http_exception_handler(request, exc: HTTPException):
-    return JSONResponse(status_code=exc.status_code, content={"error": exc.detail})
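As a usage reference, the two endpoints above can be exercised without the browser frontend. A minimal sketch using FastAPI's TestClient; the sample text and paper.pdf file are hypothetical, and importing main loads the BART model, so the first run is slow:

# test_endpoints.py - a sketch; TestClient needs the httpx test dependency.
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

# Text endpoint: JSON body with a "text" field, per summarize_text() above.
resp = client.post("/summarize/text", json={"text": "Some scientific text ..."})
print(resp.status_code, resp.json())

# PDF endpoint: multipart upload, per summarize_pdf() above.
with open("paper.pdf", "rb") as f:  # hypothetical local file
    resp = client.post("/summarize/pdf", files={"file": ("paper.pdf", f, "application/pdf")})
print(resp.status_code, resp.json())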
pdf_reader.py
DELETED
@@ -1,47 +0,0 @@
-# pdf_reader.py
-import io
-from typing import Optional
-
-# We support reading via several libraries: PyPDF2 first, then pdfminer.six if needed
-try:
-    from PyPDF2 import PdfReader
-    _has_pypdf2 = True
-except Exception:
-    _has_pypdf2 = False
-
-try:
-    from pdfminer.high_level import extract_text as pdfminer_extract_text
-    _has_pdfminer = True
-except Exception:
-    _has_pdfminer = False
-
-def extract_text_from_pdf(pdf_bytes: bytes) -> str:
-    """
-    Extract text from a PDF. Tries several methods in turn.
-    """
-    # 1) PyPDF2
-    if _has_pypdf2:
-        try:
-            reader = PdfReader(io.BytesIO(pdf_bytes))
-            text = ""
-            for page in reader.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
-            if text.strip():
-                return text
-        except Exception:
-            pass
-
-    # 2) pdfminer.six (usually better for complex PDFs)
-    if _has_pdfminer:
-        try:
-            text = pdfminer_extract_text(io.BytesIO(pdf_bytes))
-            if text and text.strip():
-                return text
-        except Exception:
-            pass
-
-    # If nothing worked, return an empty string
-    return ""
-
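For completeness, a small sketch of how this module was consumed (the input file name is hypothetical):

# A minimal usage sketch, assuming sample.pdf exists alongside the script.
from pdf_reader import extract_text_from_pdf

with open("sample.pdf", "rb") as f:  # hypothetical input file
    text = extract_text_from_pdf(f.read())

# An empty string means both PyPDF2 and pdfminer.six failed (or found no text),
# which main.py reports as "PDF is empty or could not be processed."
print(text[:500] if text else "no text extracted")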
requirements.txt
DELETED
@@ -1,8 +0,0 @@
-fastapi
-uvicorn[standard]
-transformers
-torch
-PyPDF2
-python-multipart
-pdfminer.six
-
script.js
DELETED
@@ -1,88 +0,0 @@
-// script.js
-async function summarizeText() {
-  clearMessages();
-  const text = document.getElementById("inputText").value;
-  const btn = document.getElementById("summTextBtn");
-  btn.disabled = true;
-  btn.innerText = "Summarizing...";
-  try {
-    const response = await fetch("/summarize/text", {
-      method: "POST",
-      headers: {"Content-Type": "application/json"},
-      body: JSON.stringify({ text })
-    });
-    if (!response.ok) {
-      const err = await response.json();
-      showError(err.detail || "Error during summarization.");
-      return;
-    }
-    const data = await response.json();
-    document.getElementById("summaryBox").innerText = data.summary;
-  } catch (e) {
-    showError(e.message || "Network error.");
-  } finally {
-    btn.disabled = false;
-    btn.innerText = "Summarize Text";
-  }
-}
-
-async function summarizePDF() {
-  clearMessages();
-  const fileInput = document.getElementById("pdfFile");
-  if (!fileInput.files.length) {
-    showError("Please select a PDF file first.");
-    return;
-  }
-  const file = fileInput.files[0];
-  const formData = new FormData();
-  formData.append("file", file);
-
-  const btn = document.getElementById("summPdfBtn");
-  btn.disabled = true;
-  btn.innerText = "Summarizing...";
-
-  try {
-    const response = await fetch("/summarize/pdf", {
-      method: "POST",
-      body: formData
-    });
-    if (!response.ok) {
-      const err = await response.json();
-      showError(err.detail || "Error during PDF summarization.");
-      return;
-    }
-    const data = await response.json();
-    document.getElementById("summaryBox").innerText = data.summary;
-  } catch (e) {
-    showError(e.message || "Network error.");
-  } finally {
-    btn.disabled = false;
-    btn.innerText = "Summarize PDF";
-  }
-}
-
-function showError(msg) {
-  const errBox = document.getElementById("errorBox");
-  errBox.hidden = false;
-  errBox.innerText = msg;
-}
-
-function clearMessages() {
-  document.getElementById("summaryBox").innerText = "";
-  const errBox = document.getElementById("errorBox");
-  errBox.hidden = true;
-  errBox.innerText = "";
-}
-
-function copySummary() {
-  const text = document.getElementById("summaryBox").innerText;
-  if (!text) {
-    alert("There's no summary to copy.");
-    return;
-  }
-  navigator.clipboard.writeText(text).then(() => {
-    alert("Summary copied to clipboard.");
-  }).catch(() => {
-    alert("Failed to copy summary.");
-  });
-}
styles.css
DELETED
@@ -1,69 +0,0 @@
-/* styles.css */
-body {
-  font-family: Arial, sans-serif;
-  background: #f1f1f1;
-  padding: 20px;
-}
-
-h1 {
-  text-align: center;
-}
-
-.container {
-  max-width: 900px;
-  margin: auto;
-  background: white;
-  padding: 20px;
-  border-radius: 10px;
-  box-shadow: 0 4px 12px rgba(0,0,0,0.06);
-}
-
-textarea {
-  width: 100%;
-  height: 180px;
-  padding: 10px;
-  border-radius: 6px;
-  border: 1px solid #ddd;
-  resize: vertical;
-}
-
-button {
-  padding: 10px 20px;
-  margin-top: 10px;
-  cursor: pointer;
-  border-radius: 6px;
-  border: none;
-  background: #2f6fed;
-  color: white;
-  font-weight: 600;
-}
-
-button:disabled {
-  opacity: 0.6;
-  cursor: not-allowed;
-}
-
-.row {
-  display: flex;
-  gap: 10px;
-  align-items: center;
-  margin-top: 8px;
-}
-
-#summaryBox {
-  background: #fafafa;
-  padding: 15px;
-  border-radius: 5px;
-  min-height: 120px;
-  border: 1px solid #eee;
-  margin-top: 10px;
-}
-
-.error {
-  margin-top: 12px;
-  color: #a94442;
-  background: #f2dede;
-  padding: 10px;
-  border-radius: 6px;
-  border: 1px solid #ebccd1;
-}
summarizer.py
DELETED
@@ -1,99 +0,0 @@
-# summarizer.py
-import os
-import math
-import torch
-from transformers import BartTokenizer, BartForConditionalGeneration
-
-# Configuration: the fine-tuned model name, or the default
-MODEL_NAME = os.getenv("FINE_TUNED_MODEL", "facebook/bart-large-cnn")
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-# Initialization (done once)
-tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
-model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
-model.eval()
-
-# Parameters
-MAX_INPUT_LENGTH = 1024
-SUMMARY_MIN_LENGTH = 40
-SUMMARY_MAX_LENGTH = 200
-NUM_BEAMS = 4
-
-def chunk_text(text: str, max_tokens: int = MAX_INPUT_LENGTH, overlap: int = 50):
-    """
-    Split long text into token-based chunks. `overlap` is the number of tokens repeated between chunks.
-    """
-    inputs = tokenizer(text, return_tensors="pt", truncation=False)
-    input_ids = inputs["input_ids"][0].tolist()
-    chunks = []
-    start = 0
-    while start < len(input_ids):
-        end = start + max_tokens
-        chunk_ids = input_ids[start:end]
-        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        chunks.append(chunk_text)
-        if end >= len(input_ids):
-            break
-        start = end - overlap
-    return chunks
-
-def generate_summary(text: str) -> str:
-    """
-    If the text is longer than MAX_INPUT_LENGTH, split it, summarize each part,
-    then return a short unified summary.
-    """
-    text = text.strip()
-    if not text:
-        return ""
-
-    # If the text is short, summarize it directly
-    tokens = tokenizer(text, max_length=1, truncation=False)
-    # Simple path: a single call for short text
-    if len(tokenizer.encode(text)) <= MAX_INPUT_LENGTH:
-        inputs = tokenizer([text], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt").to(DEVICE)
-        with torch.no_grad():
-            summary_ids = model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs.get("attention_mask", None),
-                num_beams=NUM_BEAMS,
-                min_length=SUMMARY_MIN_LENGTH,
-                max_length=SUMMARY_MAX_LENGTH,
-                early_stopping=True,
-                no_repeat_ngram_size=3
-            )
-        return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
-    # Long text: split it, summarize each chunk, then aggregate
-    chunks = chunk_text(text, max_tokens=MAX_INPUT_LENGTH, overlap=64)
-    partial_summaries = []
-    for chunk in chunks:
-        inputs = tokenizer([chunk], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt").to(DEVICE)
-        with torch.no_grad():
-            summary_ids = model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs.get("attention_mask", None),
-                num_beams=NUM_BEAMS,
-                min_length=SUMMARY_MIN_LENGTH // 2,
-                max_length=SUMMARY_MAX_LENGTH,
-                early_stopping=True,
-                no_repeat_ngram_size=3
-            )
-        s = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
-        partial_summaries.append(s)
-
-    # Merge: build the final short summary from partial_summaries
-    combined = "\n\n".join(partial_summaries)
-    # If combined is still too long, summarize it once more
-    inputs = tokenizer([combined], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt").to(DEVICE)
-    with torch.no_grad():
-        summary_ids = model.generate(
-            inputs["input_ids"],
-            attention_mask=inputs.get("attention_mask", None),
-            num_beams=NUM_BEAMS,
-            min_length=SUMMARY_MIN_LENGTH,
-            max_length=SUMMARY_MAX_LENGTH,
-            early_stopping=True,
-            no_repeat_ngram_size=3
-        )
-    final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
-    return final_summary
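The chunking logic in chunk_text() advances the window by max_tokens - overlap each step, so adjacent chunks share `overlap` tokens of context. (Note the unused `tokens = tokenizer(text, max_length=1, truncation=False)` assignment in generate_summary(); the short/long decision is actually made by `len(tokenizer.encode(text))`.) A small self-contained sketch of the same index arithmetic on plain integers, no tokenizer needed, makes the chunk boundaries easy to check:

# Window arithmetic from chunk_text(), demonstrated on integer "token ids".
def windows(n_tokens: int, max_tokens: int, overlap: int):
    start, spans = 0, []
    while start < n_tokens:
        end = start + max_tokens
        spans.append((start, min(end, n_tokens)))
        if end >= n_tokens:
            break
        start = end - overlap  # slide back by `overlap` so chunks share context
    return spans

# 2500 tokens, 1024-token windows, 64-token overlap (the values used above):
print(windows(2500, 1024, 64))
# -> [(0, 1024), (960, 1984), (1920, 2500)]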