Add application file
Browse files- Dockerfile +13 -0
- app.py +43 -0
- database.py +14 -0
- requirements.txt +12 -0
- scoring.py +79 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
# System dependencies: tesseract for OCR (pytesseract), a JRE for
# language_tool_python, poppler for PDF rendering, libgl1 for imaging libs.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    default-jre \
    poppler-utils \
    libgl1 \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy only the requirements first so the (slow) pip layer is cached and is
# not rebuilt on every source-code edit; the original copied all sources
# before installing, invalidating the cache on any change.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt
COPY . /app
ENV PYTHONUNBUFFERED=1
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import requests
from fastapi import FastAPI, Request, HTTPException
from aiogram import Bot, Dispatcher, types
from database import Database
from scoring import analyze_full_submission

# BOT_TOKEN is required; WEBHOOK_URL is optional (webhook registration is
# skipped at startup when it is unset).
BOT_TOKEN = os.environ.get('BOT_TOKEN')
WEBHOOK_URL = os.environ.get('WEBHOOK_URL')

# Fail fast at import time — the bot cannot do anything without a token.
if not BOT_TOKEN:
    raise SystemExit("BOT_TOKEN missing")

# Module-level singletons: FastAPI app, aiogram bot + dispatcher, and the
# sqlite wrapper (creates cefr_bot.db next to the working directory).
app = FastAPI()
bot = Bot(BOT_TOKEN)
dp = Dispatcher()
db = Database('cefr_bot.db')
|
| 18 |
+
|
| 19 |
+
@dp.message()
async def handler(msg: types.Message):
    """Score any incoming text message as a task1_1-only submission and reply.

    Non-text messages (photos, stickers, …) are treated as empty text.
    """
    text = msg.text or ""
    submission = {"task1_1": text, "task1_2": "", "task2": ""}
    result = await analyze_full_submission(submission)
    # FIX: the original embedded a literal newline inside a double-quoted
    # f-string, which is a SyntaxError in Python; use explicit \n escapes.
    await msg.answer(
        f"Total: {result['total']}/75 → {result['level']}\n\n{result['feedback']}"
    )
|
| 27 |
+
|
| 28 |
+
@app.post("/webhook")
async def webhook(request: Request):
    """Telegram webhook endpoint: decode the update and feed the dispatcher."""
    payload = await request.json()
    incoming = types.Update(**payload)
    await dp.feed_update(bot, incoming)
    return {"ok": True}
|
| 34 |
+
|
| 35 |
+
@app.on_event("startup")
async def startup():
    """Best-effort webhook registration with Telegram on application boot.

    Skipped entirely when WEBHOOK_URL is unset. Failures are logged and
    swallowed so a transient Telegram outage does not prevent startup.
    """
    if WEBHOOK_URL:
        url = f"https://api.telegram.org/bot{BOT_TOKEN}/setWebhook?url={WEBHOOK_URL}"
        try:
            print("Setting webhook…")
            # NOTE: a blocking call inside an async handler is acceptable here
            # because it runs exactly once at startup; add a timeout so a hung
            # Telegram API cannot stall boot indefinitely.
            print(requests.get(url, timeout=10).text)
        except requests.RequestException as exc:
            # FIX: was a bare `except:` which also swallows KeyboardInterrupt
            # and SystemExit; catch only network/HTTP failures.
            print("Webhook failed:", exc)
|
database.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3, json
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
|
| 4 |
+
class Database:
    """Minimal sqlite3 wrapper; creates the schema on construction.

    Connections returned by connect() are owned by the caller and must be
    closed by the caller.
    """

    def __init__(self, path):
        # path: filesystem path of the sqlite database file.
        self.path = path
        self.ensure()

    def connect(self):
        """Open and return a new connection to the database file."""
        return sqlite3.connect(self.path)

    def ensure(self):
        """Create the users table if it does not exist yet."""
        conn = self.connect()
        try:
            # Connection.execute creates an implicit cursor; no need for an
            # explicit cursor object here.
            conn.execute(
                "CREATE TABLE IF NOT EXISTS users (user_id INTEGER PRIMARY KEY, full_name TEXT)"
            )
            conn.commit()
        finally:
            # FIX: the original leaked the connection if execute() raised;
            # always close, even on error.
            conn.close()
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiogram==3.0.0b7
|
| 2 |
+
fastapi==0.95.2
|
| 3 |
+
uvicorn==0.22.0
|
| 4 |
+
pytesseract
|
| 5 |
+
pillow
|
| 6 |
+
transformers>=4.30.0
|
| 7 |
+
sentence-transformers>=2.2.0
|
| 8 |
+
language_tool_python>=2.9.0
|
| 9 |
+
torch>=1.12.0
|
| 10 |
+
nltk
|
| 11 |
+
numpy
|
| 12 |
+
requests
|
scoring.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, nltk, numpy as np, requests
from sentence_transformers import SentenceTransformer
import language_tool_python
from PIL import Image
from io import BytesIO
import pytesseract
from datetime import datetime

# Ensure the punkt sentence tokenizer is present before any scoring call.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # FIX: was a bare `except:`, which would also swallow KeyboardInterrupt
    # and SystemExit; nltk.data.find raises LookupError when data is missing.
    nltk.download('punkt')

# Heavyweight singletons, loaded once at import time (grammar checker and
# sentence-embedding model).
tool = language_tool_python.LanguageTool('en-US')
embed = SentenceTransformer('all-MiniLM-L6-v2')

# Optional Hugging Face inference configuration for feedback generation.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_FEEDBACK_MODEL_URL = os.getenv("HF_FEEDBACK_MODEL_URL", "https://api-inference.huggingface.co/models/google/flan-t5-large")
|
| 18 |
+
|
| 19 |
+
def ocr_image_from_filebytes(b):
    """Run Tesseract OCR over raw image bytes and return the extracted text."""
    image = Image.open(BytesIO(b))
    return pytesseract.image_to_string(image.convert("RGB"))
|
| 21 |
+
|
| 22 |
+
def count_errors(t):
    """Number of grammar/spelling issues LanguageTool flags in *t*."""
    matches = tool.check(t)
    return len(matches)
|
| 24 |
+
|
| 25 |
+
def lex_div(t):
    """Lexical diversity: distinct alphabetic types over all tokens (0 if empty).

    Note the denominator counts every token (including punctuation) while the
    numerator counts only alphabetic types — same ratio as the original.
    """
    tokens = nltk.word_tokenize(t.lower())
    if not tokens:
        return 0
    alpha_types = {tok for tok in tokens if tok.isalpha()}
    return len(alpha_types) / len(tokens)
|
| 28 |
+
|
| 29 |
+
def avg_sent_len(t):
    """Mean sentence length in word tokens; 0 for text with no sentences."""
    sentences = nltk.sent_tokenize(t)
    if not sentences:
        return 0
    token_counts = [len(nltk.word_tokenize(sent)) for sent in sentences]
    return sum(token_counts) / len(sentences)
|
| 32 |
+
|
| 33 |
+
def coherence(t):
    """Mean cosine similarity between consecutive sentence embeddings.

    Returns 0.5 (neutral) when there are fewer than two sentences.
    """
    sentences = nltk.sent_tokenize(t)
    if len(sentences) < 2:
        return 0.5
    vectors = embed.encode(sentences)
    sims = []
    for cur, nxt in zip(vectors, vectors[1:]):
        # 1e-9 guards against division by zero for degenerate embeddings
        denom = np.linalg.norm(cur) * np.linalg.norm(nxt) + 1e-9
        sims.append(np.dot(cur, nxt) / denom)
    return float(sum(sims) / len(sims))
|
| 39 |
+
|
| 40 |
+
def norm25(v, a, b):
    """Clamp *v* into [a, b] and map it linearly onto the integer scale 0–25.

    Returns 0 whenever the range is empty or inverted (b <= a).
    """
    if b <= a:
        return 0
    clamped = min(max(v, a), b)
    return int(round((clamped - a) / (b - a) * 25))
|
| 43 |
+
|
| 44 |
+
def score_text(t):
    """Score one answer text: grammar, vocab and coherence, each on 0–25.

    Returns a dict with the three sub-scores plus the raw LanguageTool
    error count.
    """
    n_errors = count_errors(t)
    tokens = nltk.word_tokenize(t)
    # Errors per 100 tokens; `or 1` avoids division by zero on empty text.
    errors_per_100 = n_errors / (len(tokens) or 1) * 100
    grammar_score = norm25(max(0, 30 - errors_per_100), 0, 30)
    diversity = lex_div(t)
    mean_len = avg_sent_len(t)
    # Vocabulary: 70% lexical diversity, 30% (capped) sentence length.
    vocab_score = int(round(
        0.7 * norm25(diversity, 0, 0.6) + 0.3 * norm25(min(mean_len, 40), 3, 20)
    ))
    # Cosine similarity lives in [-1, 1]; shift into [0, 1] before scaling.
    coh_score = norm25((coherence(t) + 1) / 2, 0, 1)
    return {
        "grammar": grammar_score,
        "vocab": vocab_score,
        "coherence": coh_score,
        "errors_count": n_errors,
    }
|
| 54 |
+
|
| 55 |
+
async def analyze_full_submission(sub):
    """Score a three-part submission and aggregate into a 0–75 total + level.

    Args:
        sub: mapping with keys "task1_1", "task1_2", "task2" → answer text.
             Missing keys or None values are treated as empty and skipped.

    Returns:
        dict with per-task "analysis", weighted integer "total" (0–75),
        CEFR "level" string, placeholder "feedback", and an ISO "time" stamp.
    """
    analyses = {}
    for k in ("task1_1", "task1_2", "task2"):
        # `or ""` tolerates None values, which would crash on .strip();
        # the original also kept an unused `combined` list — removed.
        t = sub.get(k) or ""
        if not t.strip():
            analyses[k] = {"skipped": True}
        else:
            analyses[k] = score_text(t)

    # task2 (the essay) dominates the weighting; skipped parts contribute 0.
    weights = {"task1_1": 0.1, "task1_2": 0.4, "task2": 0.5}
    total = 0
    for k, w in weights.items():
        part = analyses[k]
        if "skipped" in part:
            continue
        total += (part["grammar"] + part["vocab"] + part["coherence"]) * w
    total = int(round(total))

    # Threshold bands mapping the aggregate score to a CEFR level.
    level = "Below B1"
    if total >= 65:
        level = "C1"
    elif total >= 51:
        level = "B2"
    elif total >= 38:
        level = "B1"

    return {
        "analysis": analyses,
        "total": total,
        "level": level,
        "feedback": "Basic feedback.",
        # naive-UTC timestamp, kept as in the original for output compatibility
        "time": datetime.utcnow().isoformat(),
    }
|