yunus789 committed
Commit 7582b13 · verified · 1 Parent(s): 50f5b28

Upload 7 files

Files changed (7)
  1. app.py +1 -0
  2. grammar_service.py +182 -0
  3. main.py +103 -0
  4. pdf_service.py +41 -0
  5. requirements.txt +13 -0
  6. summarize_service.py +45 -0
  7. surya_ocr.py +36 -0
app.py ADDED
@@ -0,0 +1 @@
+ from main import app
grammar_service.py ADDED
@@ -0,0 +1,182 @@
+ # grammar_service.py
+ import re
+ from functools import lru_cache
+
+ # Optional: use the kbbi module if it is available; otherwise skip the KBBI checks.
+ try:
+     from kbbi import KBBI
+     _HAS_KBBI = True
+ except Exception:
+     _HAS_KBBI = False
+
+
+ @lru_cache(maxsize=10000)
+ def cek_kbbi(kata: str) -> bool:
+     """
+     Check KBBI when available. Short words (<= 2 chars) return True so they are never blocked.
+     """
+     kata = kata.strip()
+     if not kata or len(kata) <= 2:
+         return True
+     if not _HAS_KBBI:
+         # Without KBBI we cannot verify the word, so report it as not found
+         return False
+     try:
+         KBBI(kata)
+         return True
+     except Exception:
+         return False
+
+
+ # Simplified dictionaries (extend as needed; DI_LOKASI is currently unused)
+ DI_LOKASI = {
+     "rumah", "kantor", "sekolah", "kampus", "pasar", "kota", "desa", "kabupaten",
+     "provinsi", "negara", "daerah", "jalan", "kamar", "dapur", "teras", "taman",
+     "toilet", "wc", "garasi", "hotel", "restoran", "kafe", "stasiun",
+     "bandara", "terminal", "perpustakaan", "lapangan", "rumah sakit", "bank",
+ }
+
+ KATA_KERJA_UMUM = {
+     "ambil", "taruh", "angkat", "makan", "minum", "bawa", "buat", "beri", "jual", "beli",
+     "tulis", "hapus", "cetak", "masak", "pakai", "bakar", "dorong", "tarik", "tutup",
+     "buka", "cuci", "tolong", "lari", "jalan", "duduk", "berdiri", "naik", "turun",
+     "klik", "ketik", "upload", "download", "kirim", "simpan", "cari", "parkir",
+ }
+
+
+ def _find_sentence_starts(text: str):
+     """
+     Yield (position, character) for the first character of each sentence.
+     A sentence starts at the beginning of the text or after .!? followed by whitespace.
+     """
+     pattern = re.compile(r'(^|[\.!?]\s+)([^\s])', flags=re.MULTILINE | re.UNICODE)
+     for m in pattern.finditer(text):
+         yield m.start(2), m.group(2)
+
+
+ def check_grammar(text: str):
+     """
+     Returns a dict:
+     {
+         "corrected_text": "...",
+         "errors": [
+             {"start": int, "end": int, "original": "...", "suggestion": "...", "message": "..."},
+             ...
+         ]
+     }
+     Detected issues:
+     - sentence not starting with a capital letter
+     - 'di' wrongly split from / joined to the next word (simple heuristic)
+     - double spaces
+     - a space before punctuation
+     - word + 'nya' that should be written as one word (checked against KBBI if available)
+     """
+     if not text:
+         return {"corrected_text": "", "errors": []}
+
+     errors = []
+
+     # 1) Lowercase sentence start -> suggest capitalizing the first word
+     for pos, char in _find_sentence_starts(text):
+         if char.isalpha() and char.islower():
+             # take the full word starting at pos
+             m_word = re.match(r'[^\s\.,;:!?()"\']+', text[pos:])
+             if m_word:
+                 word = m_word.group(0)
+                 start = pos
+                 end = pos + len(word)
+                 suggestion = word[0].upper() + word[1:]
+                 errors.append({
+                     "start": start, "end": end,
+                     "original": text[start:end],
+                     "suggestion": suggestion,
+                     "message": "Huruf pertama kalimat harus kapital"
+                 })
+
+     # 2) Space before punctuation -> remove the space
+     for m in re.finditer(r'\s+([,.:;!?])', text):
+         start, end = m.start(0), m.end(0)
+         # suggestion: the punctuation mark alone (no space)
+         suggestion = m.group(1)
+         errors.append({
+             "start": start, "end": end,
+             "original": text[start:end],
+             "suggestion": suggestion,
+             "message": "Hapus spasi sebelum tanda baca"
+         })
+
+     # 3) Double spaces -> replace with a single space
+     for m in re.finditer(r' {2,}', text):
+         start, end = m.start(), m.end()
+         suggestion = " "
+         errors.append({
+             "start": start, "end": end,
+             "original": text[start:end],
+             "suggestion": suggestion,
+             "message": "Spasi ganda — gunakan satu spasi"
+         })
+
+     # 4) 'di' wrongly separated / should be joined
+     for m in re.finditer(r'\bdi\s+([^\s,\.!?;:()"\']+)', text, flags=re.IGNORECASE):
+         kata = m.group(1)
+         start, end = m.start(0), m.end(0)
+         kata_lower = kata.lower()
+
+         # a common verb -> join ('di ambil' -> 'diambil')
+         if kata_lower in KATA_KERJA_UMUM:
+             suggestion = "di" + kata
+             errors.append({
+                 "start": start, "end": end,
+                 "original": text[start:end],
+                 "suggestion": suggestion,
+                 "message": f"Gabungkan 'di' dengan kata kerja (bentuk baku: 'di{kata_lower}')"
+             })
+         else:
+             # if KBBI is available and 'di' + word is a standard entry, suggest joining
+             if cek_kbbi("di" + kata_lower):
+                 suggestion = "di" + kata
+                 errors.append({
+                     "start": start, "end": end,
+                     "original": text[start:end],
+                     "suggestion": suggestion,
+                     "message": "Kemungkinan kata baku 'di'+kata seharusnya digabung"
+                 })
+
+     # 5) 'masak nya' -> 'masaknya'
+     for m in re.finditer(r'\b([^\s,\.!?;:()"\']+)\s+nya\b', text, flags=re.IGNORECASE):
+         dasar = m.group(1)
+         gab = (dasar + "nya").lower()
+         if cek_kbbi(gab):
+             start, end = m.start(0), m.end(0)
+             suggestion = dasar + "nya"
+             errors.append({
+                 "start": start, "end": end,
+                 "original": text[start:end],
+                 "suggestion": suggestion,
+                 "message": f"Gabungkan kata dan 'nya' menjadi '{suggestion}'"
+             })
+
+     # Deduplicate errors
+     seen = set()
+     unique_errors = []
+     for e in errors:
+         key = (e["start"], e["end"], e["suggestion"])
+         if key not in seen:
+             seen.add(key)
+             unique_errors.append(e)
+
+     # Sort by start position
+     unique_errors.sort(key=lambda x: x["start"])
+
+     # Build corrected_text (apply edits back to front so earlier offsets stay valid)
+     corrected = text
+     edits = [(e["start"], e["end"], e["suggestion"]) for e in unique_errors]
+     edits.sort(key=lambda t: t[0], reverse=True)
+
+     for s, e, sug in edits:
+         corrected = corrected[:s] + sug + corrected[e:]
+
+     return {
+         "corrected_text": corrected,
+         "errors": unique_errors
+     }
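
For reference, a minimal sketch of calling check_grammar directly (assumes the module sits at the repo root as uploaded; the sample sentence is illustrative, and the 'nya' check only fires when the kbbi package is installed):

import grammar_service

result = grammar_service.check_grammar("buku di ambil dari rak .")
print(result["corrected_text"])  # -> "Buku diambil dari rak."
for err in result["errors"]:
    print(err["original"], "->", err["suggestion"], "|", err["message"])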
main.py ADDED
@@ -0,0 +1,103 @@
+ # main.py
+ import os
+ import asyncio
+ from fastapi import FastAPI, UploadFile, File, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse
+ from typing import List
+
+ # --- Import services (uploaded at the repo root) ---
+ from surya_ocr import ocr_surya
+ from grammar_service import check_grammar
+ from summarize_service import summarize_text
+ from pdf_service import export_pdf
+
+ app = FastAPI(title="NoteBoard AI Backend")
+
+ # ---------------------------------------------------------
+ # CORS
+ # ---------------------------------------------------------
+ ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*")
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[origin.strip() for origin in ALLOWED_ORIGINS.split(",")],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ # ---------------------------------------------------------
+ # OCR Endpoint
+ # ---------------------------------------------------------
+ @app.post("/ocr")
+ async def ocr_endpoint(files: List[UploadFile] = File(...)):
+     """
+     Accepts one or more files and returns the OCR results as a list.
+     """
+     results = []
+
+     for file in files:
+         content = await file.read()
+         text = await ocr_surya(content)  # Surya OCR, async
+         results.append(text)
+
+     return {"results": results}
+
+
+ # ---------------------------------------------------------
+ # Grammar Check Endpoint
+ # ---------------------------------------------------------
+ @app.post("/grammar")
+ async def grammar_endpoint(text: str = Form(...)):
+     """
+     Accepts text from form data and returns the grammar-check result.
+     """
+     # Run in a thread pool because check_grammar is a synchronous function
+     result = await asyncio.to_thread(check_grammar, text)
+     return result
+
+
+ # ---------------------------------------------------------
+ # Summarization Endpoint
+ # ---------------------------------------------------------
+ @app.post("/summarize")
+ async def summarize_endpoint(text: str = Form(...)):
+     """
+     Summarizes the text using the AI model.
+     """
+     summary = await summarize_text(text)
+     return {"summary": summary}
+
+
+ # ---------------------------------------------------------
+ # Export PDF Endpoint
+ # ---------------------------------------------------------
+ @app.post("/export-pdf")
+ async def export_pdf_endpoint(text: str = Form(...)):
+     """
+     Generates a PDF and sends the file back. export_pdf is
+     synchronous, so it runs in a separate thread.
+     """
+     pdf_path = await asyncio.to_thread(export_pdf, text, "output.pdf")
+
+     return FileResponse(
+         pdf_path,
+         media_type="application/pdf",
+         filename="output.pdf"
+     )
+
+
+ # ---------------------------------------------------------
+ # Health Check
+ # ---------------------------------------------------------
+ @app.get("/")
+ async def root():
+     return {"message": "NoteBoard AI Backend is running!", "status": "ok"}
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
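
Once the server is running, the endpoints can be exercised from Python; a minimal sketch assuming a local instance on the default port 7860 and a hypothetical image file note.png (requests is already pinned in requirements.txt):

import requests

BASE = "http://localhost:7860"  # assumed local dev URL

# /grammar and /summarize expect form fields, not JSON
r = requests.post(f"{BASE}/grammar", data={"text": "buku di ambil dari rak ."})
print(r.json()["corrected_text"])

# /ocr accepts one or more uploaded files under the field name "files"
with open("note.png", "rb") as f:  # hypothetical sample image
    r = requests.post(f"{BASE}/ocr", files=[("files", ("note.png", f, "image/png"))])
print(r.json()["results"])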
pdf_service.py ADDED
@@ -0,0 +1,41 @@
+ # pdf_service.py
+ from reportlab.lib.pagesizes import A4
+ from reportlab.pdfgen import canvas
+
+ def export_pdf(text: str, file_path: str = "output.pdf") -> str:
+     """
+     Creates a PDF file from the text (simple, line by line).
+     Returns the file path.
+     """
+     c = canvas.Canvas(file_path, pagesize=A4)
+     width, height = A4
+     margin_left = 50
+     margin_top = 50
+     y = height - margin_top
+     line_height = 14
+
+     for line in text.split('\n'):
+         # wrap long lines across several rows if needed (simple wrap)
+         if not line:
+             y -= line_height
+         else:
+             # simple wrap every 90 characters (not ideal, but good enough)
+             max_chars = 90
+             while len(line) > max_chars:
+                 chunk = line[:max_chars]
+                 c.drawString(margin_left, y, chunk)
+                 line = line[max_chars:]
+                 y -= line_height
+                 if y < margin_top:
+                     c.showPage()
+                     y = height - margin_top
+             # draw the remaining part of the line
+             c.drawString(margin_left, y, line)
+             y -= line_height
+
+         if y < margin_top:
+             c.showPage()
+             y = height - margin_top
+
+     c.save()
+     return file_path
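
export_pdf is a plain synchronous helper, so it can also be used on its own; a minimal sketch:

from pdf_service import export_pdf

path = export_pdf("Baris pertama\nBaris kedua yang lebih panjang", "demo.pdf")
print("PDF written to", path)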
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ fastapi==0.115.0
+ uvicorn[standard]==0.32.0
+
+ numpy==1.26.4
+ pillow==10.4.0
+ requests==2.32.3
+ regex==2024.11.6
+ tqdm==4.67.1
+
+ transformers==4.56.1
+ surya-ocr==0.17.0
+
+ reportlab==4.2.5
summarize_service.py ADDED
@@ -0,0 +1,45 @@
+ import asyncio
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+ MODEL_ID = "panggi/t5-base-indonesian-summarization-cased"
+
+ tokenizer = None
+ model = None
+
+ def load_model():
+     global tokenizer, model
+     if tokenizer is None or model is None:
+         tokenizer = T5Tokenizer.from_pretrained(MODEL_ID)
+         model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)
+
+
+ def _summarize_sync(text: str):
+     load_model()
+
+     # Encode with a 512-token cap
+     input_ids = tokenizer.encode(
+         text,
+         return_tensors="pt",
+         truncation=True,
+         max_length=512
+     )
+
+     # Generation parameters for the summary
+     summary_ids = model.generate(
+         input_ids,
+         max_length=250,
+         min_length=40,
+         num_beams=2,
+         repetition_penalty=2.5,
+         length_penalty=1.0,
+         early_stopping=True,
+         no_repeat_ngram_size=2,
+         use_cache=True
+     )
+
+     summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return summary_text
+
+
+ async def summarize_text(text: str):
+     return await asyncio.to_thread(_summarize_sync, text)
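
summarize_text is a coroutine, so calling it outside FastAPI needs an event loop; a minimal sketch (the first call downloads the model from the Hugging Face Hub, and T5Tokenizer additionally requires the sentencepiece package at runtime):

import asyncio
from summarize_service import summarize_text

long_text = "..."  # long Indonesian text to summarize
print(asyncio.run(summarize_text(long_text)))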
surya_ocr.py ADDED
@@ -0,0 +1,36 @@
+ import asyncio
+ from PIL import Image
+ from io import BytesIO
+
+ from surya.foundation import FoundationPredictor
+ from surya.recognition import RecognitionPredictor
+ from surya.detection import DetectionPredictor
+
+ foundation = None
+ recognizer = None
+ detector = None
+
+
+ def load_surya():
+     global foundation, recognizer, detector
+     if foundation is None:
+         foundation = FoundationPredictor()
+         recognizer = RecognitionPredictor(foundation)
+         detector = DetectionPredictor()
+
+
+ def _run_ocr_sync(img):
+     load_surya()
+     result = recognizer([img], det_predictor=detector)
+     result = result[0] if isinstance(result, list) else result
+     return "\n".join([l.text for l in result.text_lines])
+
+
+ async def ocr_surya(image_bytes: bytes) -> str:
+     try:
+         img = Image.open(BytesIO(image_bytes)).convert("RGB")
+         text = await asyncio.to_thread(_run_ocr_sync, img)
+         return " ".join(text.split())
+     except Exception as e:
+         print("OCR Error:", e)
+         return ""