mkingboi commited on
Commit
6861d19
·
1 Parent(s): b9c79af

Add application file

Browse files
Files changed (5) hide show
  1. Dockerfile +13 -0
  2. app.py +43 -0
  3. database.py +14 -0
  4. requirements.txt +12 -0
  5. scoring.py +79 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# System packages: Tesseract for OCR, a JRE (needed by language_tool_python),
# poppler for PDF rendering, libgl1 for imaging libraries.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    default-jre \
    poppler-utils \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only requirements first so the (slow) pip layer is cached across
# code-only changes; the original copied the whole tree before installing,
# invalidating the cache on every edit.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . /app

ENV PYTHONUNBUFFERED=1
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
from fastapi import FastAPI, Request, HTTPException
from aiogram import Bot, Dispatcher, types
from database import Database
from scoring import analyze_full_submission

# Bot credentials and the public callback address come from the environment.
BOT_TOKEN = os.getenv('BOT_TOKEN')
WEBHOOK_URL = os.getenv('WEBHOOK_URL')

# The bot cannot run at all without a token, so abort start-up early.
if not BOT_TOKEN:
    raise SystemExit("BOT_TOKEN missing")

# Module-level singletons wired together at import time.
app = FastAPI()
bot = Bot(BOT_TOKEN)
dp = Dispatcher()
db = Database('cefr_bot.db')
18
+
19
@dp.message()
async def handler(msg: types.Message):
    """Score any incoming text message as Task 1.1 and reply with the result.

    Only "task1_1" is filled from the plain message; the other two task
    slots are submitted empty (they are skipped by the scorer).
    """
    text = msg.text or ""
    submission = {"task1_1": text, "task1_2": "", "task2": ""}
    result = await analyze_full_submission(submission)
    # BUG FIX: the original embedded a raw line break inside a single-quoted
    # f-string, which is a SyntaxError; use explicit \n escapes instead.
    await msg.answer(
        f"Total: {result['total']}/75 → {result['level']}\n\n{result['feedback']}"
    )
27
+
28
+ @app.post("/webhook")
29
+ async def webhook(request: Request):
30
+ data = await request.json()
31
+ update = types.Update(**data)
32
+ await dp.feed_update(bot, update)
33
+ return {"ok": True}
34
+
35
+ @app.on_event("startup")
36
+ async def startup():
37
+ if WEBHOOK_URL:
38
+ url = f"https://api.telegram.org/bot{BOT_TOKEN}/setWebhook?url={WEBHOOK_URL}"
39
+ try:
40
+ print("Setting webhook…")
41
+ print(requests.get(url).text)
42
+ except:
43
+ print("Webhook failed")
database.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3, json
2
+ from datetime import datetime
3
+
4
class Database:
    """Thin wrapper around the bot's SQLite database file.

    Opening an instance ensures the schema exists.
    """

    def __init__(self, path):
        # path: filesystem path of the SQLite database file.
        self.path = path
        self.ensure()

    def connect(self):
        """Open and return a new connection to the database file."""
        return sqlite3.connect(self.path)

    def ensure(self):
        """Create the `users` table if it does not exist yet.

        Uses the connection as a context manager so the transaction is
        committed on success and rolled back on error; the original leaked
        the connection (and lost the commit) if `execute` raised.
        """
        conn = self.connect()
        try:
            with conn:  # commits on success, rolls back on exception
                conn.execute(
                    "CREATE TABLE IF NOT EXISTS users "
                    "(user_id INTEGER PRIMARY KEY, full_name TEXT)"
                )
        finally:
            conn.close()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Bot / web framework
aiogram==3.0.0b7
fastapi==0.95.2
uvicorn==0.22.0
# OCR of uploaded images
pytesseract
pillow
# NLP / scoring models (embeddings, grammar checking)
transformers>=4.30.0
sentence-transformers>=2.2.0
language_tool_python>=2.9.0
torch>=1.12.0
nltk
numpy
# HTTP client (webhook registration, HF inference API)
requests
scoring.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, nltk, numpy as np, requests
2
+ from sentence_transformers import SentenceTransformer
3
+ import language_tool_python
4
+ from PIL import Image
5
+ from io import BytesIO
6
+ import pytesseract
7
+ from datetime import datetime
8
+
9
# Ensure the punkt tokenizer data is present before any tokenization runs.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Narrowed from a bare `except:`: nltk.data.find raises LookupError
    # when the resource is missing; anything else should propagate.
    nltk.download('punkt')

# Heavy, stateful singletons created once at import time.
tool = language_tool_python.LanguageTool('en-US')
embed = SentenceTransformer('all-MiniLM-L6-v2')

# Optional Hugging Face inference credentials / feedback-model endpoint.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_FEEDBACK_MODEL_URL = os.getenv(
    "HF_FEEDBACK_MODEL_URL",
    "https://api-inference.huggingface.co/models/google/flan-t5-large",
)
18
+
19
def ocr_image_from_filebytes(b):
    """Run Tesseract OCR over raw image bytes and return the extracted text."""
    image = Image.open(BytesIO(b)).convert("RGB")
    return pytesseract.image_to_string(image)
21
+
22
def count_errors(t):
    """Return how many grammar/spelling issues LanguageTool flags in *t*."""
    matches = tool.check(t)
    return len(matches)
24
+
25
def lex_div(t):
    """Lexical diversity: distinct alphabetic words over all tokens (0 if empty)."""
    tokens = nltk.word_tokenize(t.lower())
    if not tokens:
        return 0
    unique_words = {tok for tok in tokens if tok.isalpha()}
    return len(unique_words) / len(tokens)
28
+
29
def avg_sent_len(t):
    """Mean number of word tokens per sentence (0 for empty text)."""
    sentences = nltk.sent_tokenize(t)
    if not sentences:
        return 0
    token_total = sum(len(nltk.word_tokenize(sent)) for sent in sentences)
    return token_total / len(sentences)
32
+
33
def coherence(t):
    """Mean cosine similarity between embeddings of adjacent sentences.

    Returns a neutral 0.5 when there are fewer than two sentences.
    """
    sentences = nltk.sent_tokenize(t)
    if len(sentences) < 2:
        return 0.5
    vecs = embed.encode(sentences)
    sims = []
    for a, b in zip(vecs, vecs[1:]):
        # 1e-9 guards against division by zero for degenerate embeddings.
        sims.append(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
    return float(sum(sims) / len(sims))
39
+
40
def norm25(v, a, b):
    """Clamp *v* to [a, b] and rescale it linearly onto the integer range 0..25.

    Returns 0 for a degenerate range (b <= a).
    """
    if b <= a:
        return 0
    clamped = min(max(v, a), b)
    return int(round((clamped - a) / (b - a) * 25))
43
+
44
def score_text(t):
    """Score one answer text on grammar, vocabulary and coherence (0-25 each).

    Returns a dict with the three sub-scores plus the raw error count.
    """
    errors = count_errors(t)
    words = nltk.word_tokenize(t)
    # Errors per 100 words; `or 1` avoids dividing by zero on empty input.
    err_rate = errors / (len(words) or 1) * 100
    grammar = norm25(max(0, 30 - err_rate), 0, 30)

    # Vocabulary blends lexical diversity (70%) with average sentence length (30%).
    ttr = lex_div(t)
    asl = avg_sent_len(t)
    vocab = int(round(0.7 * norm25(ttr, 0, 0.6) + 0.3 * norm25(min(asl, 40), 3, 20)))

    # Cosine similarity lies in [-1, 1]; shift/scale into [0, 1] before norming.
    coh = norm25((coherence(t) + 1) / 2, 0, 1)

    return {"grammar": grammar, "vocab": vocab, "coherence": coh, "errors_count": errors}
54
+
55
async def analyze_full_submission(sub):
    """Score a full three-task submission and map the total to a CEFR level.

    ``sub`` maps the task keys ("task1_1", "task1_2", "task2") to answer
    texts; missing keys are treated as empty. Blank answers are marked
    skipped and contribute nothing to the total.

    Returns a dict with per-task analyses, the weighted 0-75 total, the
    CEFR level string, placeholder feedback, and a UTC timestamp.
    """
    task_keys = ("task1_1", "task1_2", "task2")
    analyses = {}
    for key in task_keys:
        text = sub.get(key, "")
        if not text.strip():
            analyses[key] = {"skipped": True}
        else:
            analyses[key] = score_text(text)
    # (The original also accumulated non-empty texts into an unused
    # `combined` list; that dead code is removed.)

    # Each task's three 0-25 sub-scores sum to at most 75; the per-task
    # weights sum to 1, so the weighted total stays on the 0-75 scale.
    weights = {"task1_1": 0.1, "task1_2": 0.4, "task2": 0.5}
    total = 0
    for key, weight in weights.items():
        scores = analyses[key]
        if scores.get("skipped"):
            continue
        total += (scores["grammar"] + scores["vocab"] + scores["coherence"]) * weight
    total = int(round(total))

    # CEFR banding thresholds on the 0-75 scale.
    if total >= 65:
        level = "C1"
    elif total >= 51:
        level = "B2"
    elif total >= 38:
        level = "B1"
    else:
        level = "Below B1"

    return {
        "analysis": analyses,
        "total": total,
        "level": level,
        "feedback": "Basic feedback.",
        # NOTE(review): datetime.utcnow() is deprecated in Python 3.12;
        # kept here to preserve the existing naive-ISO timestamp format.
        "time": datetime.utcnow().isoformat(),
    }