Files changed (9) hide show
  1. Dockerfile +0 -26
  2. fine_tune.py +0 -57
  3. index.html +0 -33
  4. main.py +0 -70
  5. pdf_reader.py +0 -47
  6. requirements.txt +0 -8
  7. script.js +0 -88
  8. styles.css +0 -69
  9. summarizer.py +0 -99
Dockerfile DELETED
@@ -1,26 +0,0 @@
# Dockerfile
FROM python:3.10-slim

# System deps for PDF processing (poppler-utils is optional tooling for PDFs)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy and install python deps first so this layer is cached when only
# source files change. torch/transformers/PyPDF2/pdfminer.six are already
# listed in requirements.txt, so no extra pip install step is needed.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV PYTHONUNBUFFERED=1
# The fine-tuned model name can be set here:
# ENV FINE_TUNED_MODEL=username/bart-finetuned-arxiv

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fine_tune.py DELETED
@@ -1,57 +0,0 @@
# fine_tune.py
"""Fine-tune facebook/bart-large-cnn on an arXiv summarization dataset and push it to the Hub."""
import os

from datasets import load_dataset
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

dataset = load_dataset("marcov/scientific_papers_arxiv_promptsource")

# Small subset for quick day-to-day test runs
dataset["train"] = dataset["train"].select(range(1000))
dataset["validation"] = dataset["validation"].select(range(200))

max_input_length = 1024
max_output_length = 200

def preprocess_function(batch):
    """Tokenize articles as inputs and summaries as labels, truncating both.

    NOTE(review): assumes the dataset exposes "article"/"summary" columns —
    confirm against the actual dataset schema.
    """
    inputs = tokenizer(batch["article"], max_length=max_input_length, truncation=True)
    labels = tokenizer(batch["summary"], max_length=max_output_length, truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = labels["input_ids"]
    return batch

tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)

# The map above produces ragged (unpadded) sequences; Trainer's default
# collator cannot batch them. DataCollatorForSeq2Seq pads input_ids,
# attention_mask and labels dynamically per batch (labels padded with -100).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./bart-finetuned-arxiv-hub",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,  # set True when a GPU is available
    logging_dir="./logs",
    logging_steps=100,
    push_to_hub=True,
    # The target Hub repo belongs here, not in the push_to_hub() call below.
    hub_model_id="username/bart-finetuned-arxiv",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# The first positional argument of Trainer.push_to_hub is the commit message;
# the repo id is taken from hub_model_id in TrainingArguments.
trainer.push_to_hub("End of fine-tuning")
print("Fine-tuning complete.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
index.html DELETED
@@ -1,33 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>Scientific Article Summarizer</title>
  <link rel="stylesheet" href="/static/styles.css" />
</head>
<body>
  <h1>Scientific Article Summarizer</h1>

  <div class="container">
    <!-- Plain-text summarization -->
    <h2>Summarize Text</h2>
    <textarea id="inputText" placeholder="Insert scientific text..."></textarea>
    <div class="row">
      <button id="summTextBtn" onclick="summarizeText()">Summarize Text</button>
      <button onclick="copySummary()">Copy Summary</button>
    </div>

    <!-- PDF upload summarization -->
    <h2>Summarize PDF</h2>
    <input type="file" id="pdfFile" accept="application/pdf" />
    <div class="row">
      <button id="summPdfBtn" onclick="summarizePDF()">Summarize PDF</button>
    </div>

    <!-- Result and error display -->
    <h2>Summary Result</h2>
    <div id="summaryBox" style="white-space: pre-line;"></div>

    <div id="errorBox" class="error" hidden></div>
  </div>

  <script src="/static/script.js"></script>
</body>
</html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,70 +0,0 @@
# main.py
"""FastAPI app exposing text and PDF summarization endpoints plus the static frontend."""
import os
import logging
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.status import HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR

from summarizer import generate_summary
from pdf_reader import extract_text_from_pdf

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("summarizer_app")

app = FastAPI(title="Scientific Article Summarizer")

# CORS (frontend ↔ backend)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Static files are served from the project root (index.html references
# /static/styles.css and /static/script.js at the root).
# NOTE(review): this exposes every repo file — including source code — under
# /static/; consider moving frontend assets into a dedicated static/ folder.
app.mount("/static", StaticFiles(directory="."), name="static")

@app.get("/")
async def root():
    """Serve the single-page frontend."""
    return FileResponse("index.html")

@app.post("/summarize/text")
async def summarize_text(data: dict):
    """Summarize raw text posted as JSON: {"text": "..."}.

    Raises 400 when no text is provided and 500 on summarization failure.
    """
    text = data.get("text", "")
    if not text or not text.strip():
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="No text provided.")
    try:
        summary = generate_summary(text)
        return {"summary": summary}
    except Exception as e:
        logger.exception("Error while generating text summary")
        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

MAX_PDF_BYTES = int(os.getenv("MAX_PDF_BYTES", 10 * 1024 * 1024))  # 10 MB default

@app.post("/summarize/pdf")
async def summarize_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and summarize it.

    Rejects non-PDF, empty, and oversized uploads with 400.
    """
    # filename can be None on some clients — guard before calling .lower()
    # instead of crashing with AttributeError.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Only PDF files are supported.")
    pdf_bytes = await file.read()
    if len(pdf_bytes) == 0:
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail="Uploaded PDF is empty.")
    if len(pdf_bytes) > MAX_PDF_BYTES:
        raise HTTPException(status_code=HTTP_400_BAD_REQUEST, detail=f"PDF too large (max {MAX_PDF_BYTES} bytes).")
    try:
        text = extract_text_from_pdf(pdf_bytes)
        if not text or not text.strip():
            return {"summary": "PDF is empty or could not be processed."}
        summary = generate_summary(text)
        return {"summary": summary}
    except Exception as e:
        logger.exception("Error while processing PDF")
        raise HTTPException(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))

# Keep FastAPI's default error shape {"detail": ...}: the frontend reads
# err.detail, so the previous {"error": ...} shape hid every error message.
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc: HTTPException):
    return JSONResponse(status_code=exc.status_code, content={"detail": exc.detail})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pdf_reader.py DELETED
@@ -1,47 +0,0 @@
# pdf_reader.py
"""Best-effort PDF text extraction with multiple optional backends."""
import io
from typing import Optional

# Support several extraction libraries: PyPDF2 first, pdfminer.six as fallback.
try:
    from PyPDF2 import PdfReader
    _has_pypdf2 = True
except Exception:
    _has_pypdf2 = False

try:
    from pdfminer.high_level import extract_text as pdfminer_extract_text
    _has_pdfminer = True
except Exception:
    _has_pdfminer = False

def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from a PDF given as raw bytes.

    Tries each available backend in turn; returns "" when nothing works.
    """
    # 1) PyPDF2
    if _has_pypdf2:
        try:
            pieces = []
            for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
                extracted = page.extract_text()
                if extracted:
                    pieces.append(extracted + "\n")
            joined = "".join(pieces)
            if joined.strip():
                return joined
        except Exception:
            pass  # fall through to the next backend

    # 2) pdfminer.six (often better on complex layouts)
    if _has_pdfminer:
        try:
            mined = pdfminer_extract_text(io.BytesIO(pdf_bytes))
            if mined and mined.strip():
                return mined
        except Exception:
            pass

    # Nothing succeeded — return an empty string.
    return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,8 +0,0 @@
1
- fastapi
2
- uvicorn[standard]
3
- transformers
4
- torch
5
- PyPDF2
6
- python-multipart
7
- pdfminer.six
8
-
 
 
 
 
 
 
 
 
 
script.js DELETED
@@ -1,88 +0,0 @@
// script.js
// Frontend glue for the summarizer: posts text/PDFs to the backend and
// renders the summary or an error message.

// POST the textarea contents to /summarize/text and show the summary.
async function summarizeText() {
  clearMessages();
  const text = document.getElementById("inputText").value;
  const btn = document.getElementById("summTextBtn");
  btn.disabled = true;
  btn.innerText = "Summarizing...";
  try {
    const response = await fetch("/summarize/text", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify({ text })
    });
    if (!response.ok) {
      const err = await response.json();
      // Accept both FastAPI's default {"detail": ...} and a custom
      // {"error": ...} handler shape, so messages are never swallowed.
      showError(err.detail || err.error || "Error during summarization.");
      return;
    }
    const data = await response.json();
    document.getElementById("summaryBox").innerText = data.summary;
  } catch (e) {
    showError(e.message || "Network error.");
  } finally {
    btn.disabled = false;
    btn.innerText = "Summarize Text";
  }
}

// Upload the selected PDF to /summarize/pdf and show the summary.
async function summarizePDF() {
  clearMessages();
  const fileInput = document.getElementById("pdfFile");
  if (!fileInput.files.length) {
    showError("Please select a PDF file first.");
    return;
  }
  const file = fileInput.files[0];
  const formData = new FormData();
  formData.append("file", file);

  const btn = document.getElementById("summPdfBtn");
  btn.disabled = true;
  btn.innerText = "Summarizing...";

  try {
    const response = await fetch("/summarize/pdf", {
      method: "POST",
      body: formData
    });
    if (!response.ok) {
      const err = await response.json();
      // Same dual-shape handling as summarizeText().
      showError(err.detail || err.error || "Error during PDF summarization.");
      return;
    }
    const data = await response.json();
    document.getElementById("summaryBox").innerText = data.summary;
  } catch (e) {
    showError(e.message || "Network error.");
  } finally {
    btn.disabled = false;
    btn.innerText = "Summarize PDF";
  }
}

// Show an error message in the dedicated error box.
function showError(msg) {
  const errBox = document.getElementById("errorBox");
  errBox.hidden = false;
  errBox.innerText = msg;
}

// Reset both the summary and error displays before a new request.
function clearMessages() {
  document.getElementById("summaryBox").innerText = "";
  const errBox = document.getElementById("errorBox");
  errBox.hidden = true;
  errBox.innerText = "";
}

// Copy the current summary to the clipboard, alerting on success/failure.
function copySummary() {
  const text = document.getElementById("summaryBox").innerText;
  if (!text) {
    alert("There's no summary to copy.");
    return;
  }
  navigator.clipboard.writeText(text).then(() => {
    alert("Summary copied to clipboard.");
  }).catch(() => {
    alert("Failed to copy summary.");
  });
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
styles.css DELETED
@@ -1,69 +0,0 @@
/* styles.css — layout and theming for the summarizer frontend */

/* Page background and base typography */
body {
  padding: 20px;
  background: #f1f1f1;
  font-family: Arial, sans-serif;
}

h1 {
  text-align: center;
}

/* Centered white card that holds all controls */
.container {
  margin: auto;
  max-width: 900px;
  padding: 20px;
  background: white;
  border-radius: 10px;
  box-shadow: 0 4px 12px rgba(0,0,0,0.06);
}

/* Input area for pasted article text */
textarea {
  width: 100%;
  height: 180px;
  padding: 10px;
  border: 1px solid #ddd;
  border-radius: 6px;
  resize: vertical;
}

/* Primary action buttons */
button {
  margin-top: 10px;
  padding: 10px 20px;
  border: none;
  border-radius: 6px;
  background: #2f6fed;
  color: white;
  font-weight: 600;
  cursor: pointer;
}

/* Dim buttons while a request is in flight */
button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}

/* Horizontal button group */
.row {
  display: flex;
  align-items: center;
  gap: 10px;
  margin-top: 8px;
}

/* Summary output panel */
#summaryBox {
  margin-top: 10px;
  min-height: 120px;
  padding: 15px;
  background: #fafafa;
  border: 1px solid #eee;
  border-radius: 5px;
}

/* Error banner (hidden until an error occurs) */
.error {
  margin-top: 12px;
  padding: 10px;
  color: #a94442;
  background: #f2dede;
  border: 1px solid #ebccd1;
  border-radius: 6px;
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
summarizer.py DELETED
@@ -1,99 +0,0 @@
# summarizer.py
"""BART-based abstractive summarizer with token-level chunking for long inputs."""
import os
import math
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# Configuration: fine-tuned model name from the environment, or the default.
MODEL_NAME = os.getenv("FINE_TUNED_MODEL", "facebook/bart-large-cnn")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialization (once, at import time)
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Generation parameters
MAX_INPUT_LENGTH = 1024
SUMMARY_MIN_LENGTH = 40
SUMMARY_MAX_LENGTH = 200
NUM_BEAMS = 4

def chunk_text(text: str, max_tokens: int = MAX_INPUT_LENGTH, overlap: int = 50):
    """Split a long text into token-bounded chunks.

    `overlap` is the number of tokens repeated between consecutive chunks so
    context is not lost at chunk boundaries.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=False)
    input_ids = inputs["input_ids"][0].tolist()
    chunks = []
    start = 0
    while start < len(input_ids):
        end = start + max_tokens
        # Renamed from `chunk_text` — the original local shadowed this function.
        piece = tokenizer.decode(input_ids[start:end], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        chunks.append(piece)
        if end >= len(input_ids):
            break
        start = end - overlap
    return chunks

def _summarize_once(text: str, min_length: int) -> str:
    """Run a single beam-search generate() over `text` (truncated to MAX_INPUT_LENGTH)."""
    inputs = tokenizer([text], max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask", None),
            num_beams=NUM_BEAMS,
            min_length=min_length,
            max_length=SUMMARY_MAX_LENGTH,
            early_stopping=True,
            no_repeat_ngram_size=3
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

def generate_summary(text: str) -> str:
    """Summarize `text`.

    Short inputs get a single model call; inputs longer than MAX_INPUT_LENGTH
    tokens are chunked, each chunk summarized, and the partial summaries
    re-summarized into one unified result. Returns "" for empty input.
    """
    text = text.strip()
    if not text:
        return ""

    # Short text: summarize directly.
    # (A dead `tokenizer(text, max_length=1, truncation=False)` call was
    # removed here — its result was never used.)
    if len(tokenizer.encode(text)) <= MAX_INPUT_LENGTH:
        return _summarize_once(text, SUMMARY_MIN_LENGTH)

    # Long text: summarize each chunk, then aggregate.
    chunks = chunk_text(text, max_tokens=MAX_INPUT_LENGTH, overlap=64)
    partial_summaries = [_summarize_once(chunk, SUMMARY_MIN_LENGTH // 2) for chunk in chunks]

    # Combine the partial summaries and condense them into the final summary
    # (the combined text is truncated to MAX_INPUT_LENGTH if still too long).
    combined = "\n\n".join(partial_summaries)
    return _summarize_once(combined, SUMMARY_MIN_LENGTH)