Merge pull request #20 from cyberalertnepal/PujanDev
Browse files- .gitignore +1 -0
- Dockerfile +17 -0
- README.md +9 -0
- app.py +2 -1
- docs/api_endpoints.md +75 -0
- docs/deployment.md +105 -0
- docs/functions.md +53 -0
- docs/nestjs_integration.md +82 -0
- docs/security.md +9 -0
- docs/setup.md +23 -0
- docs/structure.md +54 -0
- features/nepali_text_classifier/__init__.py +0 -0
- features/nepali_text_classifier/controller.py +131 -0
- features/nepali_text_classifier/inferencer.py +23 -0
- features/nepali_text_classifier/model_loader.py +54 -0
- features/nepali_text_classifier/preprocess.py +38 -0
- features/nepali_text_classifier/routes.py +45 -0
- features/text_classifier/controller.py +16 -14
- features/text_classifier/model_loader.py +0 -5
- readme.md +21 -387
- requirements.txt +3 -2
.gitignore
CHANGED
|
@@ -59,3 +59,4 @@ model/
|
|
| 59 |
models/.gitattributes #<-- This line can stay if you only want to ignore that file, not the whole folder
|
| 60 |
|
| 61 |
todo.md
|
|
|
|
|
|
| 59 |
models/.gitattributes #<-- This line can stay if you only want to ignore that file, not the whole folder
|
| 60 |
|
| 61 |
todo.md
|
| 62 |
+
np_text_model
|
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
+
# you will also find guides on how best to write your Dockerfile
|
| 3 |
+
|
| 4 |
+
FROM python:3.9
|
| 5 |
+
|
| 6 |
+
RUN useradd -m -u 1000 user
|
| 7 |
+
USER user
|
| 8 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 9 |
+
|
| 10 |
+
WORKDIR /app
|
| 11 |
+
|
| 12 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 13 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 14 |
+
RUN python -m spacy download en_core_web_sm || echo "Failed to download model"
|
| 15 |
+
|
| 16 |
+
COPY --chown=user . /app
|
| 17 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Ai-Checker
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
app.py
CHANGED
|
@@ -5,6 +5,7 @@ from slowapi.errors import RateLimitExceeded
|
|
| 5 |
from slowapi.util import get_remote_address
|
| 6 |
from fastapi.responses import JSONResponse
|
| 7 |
from features.text_classifier.routes import router as text_classifier_router
|
|
|
|
| 8 |
from config import ACCESS_RATE
|
| 9 |
import requests
|
| 10 |
limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
|
|
@@ -25,7 +26,7 @@ app.add_middleware(SlowAPIMiddleware)
|
|
| 25 |
|
| 26 |
# Include your routes
|
| 27 |
app.include_router(text_classifier_router, prefix="/text")
|
| 28 |
-
|
| 29 |
@app.get("/")
|
| 30 |
@limiter.limit(ACCESS_RATE)
|
| 31 |
async def root(request: Request):
|
|
|
|
| 5 |
from slowapi.util import get_remote_address
|
| 6 |
from fastapi.responses import JSONResponse
|
| 7 |
from features.text_classifier.routes import router as text_classifier_router
|
| 8 |
+
from features.nepali_text_classifier.routes import router as nepali_text_classifier_router
|
| 9 |
from config import ACCESS_RATE
|
| 10 |
import requests
|
| 11 |
limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
|
|
|
|
| 26 |
|
| 27 |
# Include your routes
|
| 28 |
app.include_router(text_classifier_router, prefix="/text")
|
| 29 |
+
app.include_router(nepali_text_classifier_router,prefix="/NP")
|
| 30 |
@app.get("/")
|
| 31 |
@limiter.limit(ACCESS_RATE)
|
| 32 |
async def root(request: Request):
|
docs/api_endpoints.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧩 API Endpoints
|
| 2 |
+
|
| 3 |
+
### English (GPT-2) - `/text/`
|
| 4 |
+
|
| 5 |
+
| Endpoint | Method | Description |
|
| 6 |
+
| --------------------------------- | ------ | ----------------------------------------- |
|
| 7 |
+
| `/text/analyse` | POST | Classify raw English text |
|
| 8 |
+
| `/text/analyse-sentences` | POST | Sentence-by-sentence breakdown |
|
| 9 |
+
| `/text/analyse-sentance-file` | POST | Upload file, per-sentence breakdown |
|
| 10 |
+
| `/text/upload` | POST | Upload file for overall classification |
|
| 11 |
+
| `/text/health` | GET | Health check |
|
| 12 |
+
|
| 13 |
+
#### Example: Classify English text
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
curl -X POST http://localhost:8000/text/analyse \
|
| 17 |
+
-H "Authorization: Bearer <SECRET_TOKEN>" \
|
| 18 |
+
-H "Content-Type: application/json" \
|
| 19 |
+
-d '{"text": "This is a sample text for analysis."}'
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
**Response:**
|
| 23 |
+
```json
|
| 24 |
+
{
|
| 25 |
+
"result": "AI-generated",
|
| 26 |
+
"perplexity": 55.67,
|
| 27 |
+
"ai_likelihood": 66.6
|
| 28 |
+
}
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
#### Example: File upload
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
curl -X POST http://localhost:8000/text/upload \
|
| 35 |
+
-H "Authorization: Bearer <SECRET_TOKEN>" \
|
| 36 |
+
-F 'file=@yourfile.txt;type=text/plain'
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
### Nepali (SentencePiece) - `/NP/`
|
| 42 |
+
|
| 43 |
+
| Endpoint | Method | Description |
|
| 44 |
+
| --------------------------------- | ------ | ----------------------------------------- |
|
| 45 |
+
| `/NP/analyse` | POST | Classify Nepali text |
|
| 46 |
+
| `/NP/analyse-sentences` | POST | Sentence-by-sentence breakdown |
|
| 47 |
+
| `/NP/upload` | POST | Upload Nepali PDF for classification |
|
| 48 |
+
| `/NP/file-sentences-analyse` | POST | PDF upload, per-sentence breakdown |
|
| 49 |
+
| `/NP/health` | GET | Health check |
|
| 50 |
+
|
| 51 |
+
#### Example: Nepali text classification
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
curl -X POST http://localhost:8000/NP/analyse \
|
| 55 |
+
-H "Authorization: Bearer <SECRET_TOKEN>" \
|
| 56 |
+
-H "Content-Type: application/json" \
|
| 57 |
+
-d '{"text": "यो उदाहरण वाक्य हो।"}'
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
**Response:**
|
| 61 |
+
```json
|
| 62 |
+
{
|
| 63 |
+
"label": "Human",
|
| 64 |
+
"confidence": 98.6
|
| 65 |
+
}
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
#### Example: Nepali PDF upload
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
curl -X POST http://localhost:8000/NP/upload \
|
| 72 |
+
-H "Authorization: Bearer <SECRET_TOKEN>" \
|
| 73 |
+
-F 'file=@NepaliText.pdf;type=application/pdf'
|
| 74 |
+
```
|
| 75 |
+
|
docs/deployment.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Deployment
|
| 3 |
+
|
| 4 |
+
This project is containerized and deployed on **Hugging Face Spaces** using a custom `Dockerfile`. This guide explains the structure of the Dockerfile and key considerations for deploying FastAPI apps on Spaces with Docker SDK.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## 📦 Base Image
|
| 9 |
+
|
| 10 |
+
```dockerfile
|
| 11 |
+
FROM python:3.9
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
We use the official Python 3.9 image for compatibility and stability across most Python libraries and tools.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## 👤 Create a Non-Root User
|
| 19 |
+
|
| 20 |
+
```dockerfile
|
| 21 |
+
RUN useradd -m -u 1000 user
|
| 22 |
+
USER user
|
| 23 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
* Hugging Face Spaces **requires** that containers run as a non-root user with UID `1000`.
|
| 27 |
+
* We also prepend the user's local binary path to `PATH` for Python package accessibility.
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
## 🗂️ Set Working Directory
|
| 32 |
+
|
| 33 |
+
```dockerfile
|
| 34 |
+
WORKDIR /app
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
All application files will reside under `/app` for consistency and clarity.
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
## 📋 Install Dependencies
|
| 42 |
+
|
| 43 |
+
```dockerfile
|
| 44 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 45 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
* Copies the dependency list with correct file ownership.
|
| 49 |
+
* Uses `--no-cache-dir` to reduce image size.
|
| 50 |
+
* Ensures the latest compatible versions are installed.
|
| 51 |
+
|
| 52 |
+
---
|
| 53 |
+
|
| 54 |
+
## 🔡 Download Language Model (Optional)
|
| 55 |
+
|
| 56 |
+
```dockerfile
|
| 57 |
+
RUN python -m spacy download en_core_web_sm || echo "Failed to download model"
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
* Downloads the small English NLP model required by SpaCy.
|
| 61 |
+
* Uses `|| echo ...` to prevent build failure if the download fails (optional safeguard).
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 📁 Copy Project Files
|
| 66 |
+
|
| 67 |
+
```dockerfile
|
| 68 |
+
COPY --chown=user . /app
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
Copies the entire project source into the container, setting correct ownership for Hugging Face's user-based execution.
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## 🌐 Start the FastAPI Server
|
| 76 |
+
|
| 77 |
+
```dockerfile
|
| 78 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
* Launches the FastAPI app using `uvicorn`.
|
| 82 |
+
* **Port 7860 is mandatory** for Docker-based Hugging Face Spaces deployments.
|
| 83 |
+
* `app:app` refers to the `FastAPI()` instance in `app.py`.
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
## ✅ Deployment Checklist
|
| 88 |
+
|
| 89 |
+
* [x] Ensure your main file is named `app.py` or adjust `CMD` accordingly.
|
| 90 |
+
* [x] All dependencies should be listed in `requirements.txt`.
|
| 91 |
+
* [x] If using models like SpaCy, verify they are downloaded or bundled.
|
| 92 |
+
* [x] Test your Dockerfile locally with `docker build` before pushing to Hugging Face.
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
## 📚 References
|
| 97 |
+
|
| 98 |
+
* Hugging Face Docs: [Spaces Docker SDK](https://huggingface.co/docs/hub/spaces-sdks-docker)
|
| 99 |
+
* Uvicorn Docs: [https://www.uvicorn.org/](https://www.uvicorn.org/)
|
| 100 |
+
* SpaCy Models: [https://spacy.io/models](https://spacy.io/models)
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
Happy deploying!
|
| 105 |
+
**P.S.** Try not to break stuff. 😅
|
docs/functions.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Major Functions used
|
| 2 |
+
|
| 3 |
+
## In Text Classifier (`features/text_classifier/`)
|
| 4 |
+
|
| 5 |
+
- **`load_model()`**
|
| 6 |
+
Loads the GPT-2 model and tokenizer from the specified directory paths.
|
| 7 |
+
|
| 8 |
+
- **`lifespan()`**
|
| 9 |
+
Manages the application lifecycle. Initializes the model at startup and handles cleanup on shutdown.
|
| 10 |
+
|
| 11 |
+
- **`classify_text_sync()`**
|
| 12 |
+
Synchronously tokenizes input text and predicts using the GPT-2 model. Returns classification and perplexity.
|
| 13 |
+
|
| 14 |
+
- **`classify_text()`**
|
| 15 |
+
Asynchronously runs `classify_text_sync()` in a thread pool for non-blocking text classification.
|
| 16 |
+
|
| 17 |
+
- **`analyze_text()`**
|
| 18 |
+
**POST** endpoint: Accepts text input, classifies it using `classify_text()`, and returns the result with perplexity.
|
| 19 |
+
|
| 20 |
+
- **`health()`**
|
| 21 |
+
**GET** endpoint: Simple health check for API liveness.
|
| 22 |
+
|
| 23 |
+
- **`parse_docx()`, `parse_pdf()`, `parse_txt()`**
|
| 24 |
+
Utilities to extract and convert `.docx`, `.pdf`, and `.txt` file contents to plain text.
|
| 25 |
+
|
| 26 |
+
- **`warmup()`**
|
| 27 |
+
Downloads the model repository and initializes the model/tokenizer using `load_model()`.
|
| 28 |
+
|
| 29 |
+
- **`download_model_repo()`**
|
| 30 |
+
Downloads the model files from the designated `MODEL` folder.
|
| 31 |
+
|
| 32 |
+
- **`get_model_tokenizer()`**
|
| 33 |
+
Checks if the model already exists; if not, downloads it—otherwise, loads the cached model.
|
| 34 |
+
|
| 35 |
+
- **`handle_file_upload()`**
|
| 36 |
+
Handles file uploads from the `/upload` route. Extracts text, classifies, and returns results.
|
| 37 |
+
|
| 38 |
+
- **`extract_file_contents()`**
|
| 39 |
+
Extracts and returns plain text from uploaded files (PDF, DOCX, TXT).
|
| 40 |
+
|
| 41 |
+
- **`handle_file_sentence()`**
|
| 42 |
+
Processes file uploads by analyzing each sentence (under 10,000 chars) before classification.
|
| 43 |
+
|
| 44 |
+
- **`handle_sentence_level_analysis()`**
|
| 45 |
+
Checks/strips each sentence, then computes AI/human likelihood for each.
|
| 46 |
+
|
| 47 |
+
- **`analyze_sentences()`**
|
| 48 |
+
Splits paragraphs into sentences, classifies each, and returns all results.
|
| 49 |
+
|
| 50 |
+
- **`analyze_sentence_file()`**
|
| 51 |
+
Like `handle_file_sentence()`—analyzes sentences in uploaded files.
|
| 52 |
+
|
| 53 |
+
## for image_classifier
|
docs/nestjs_integration.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NestJS + FastAPI
|
| 2 |
+
|
| 3 |
+
You can easily call this API from a NestJS microservice.
|
| 4 |
+
|
| 5 |
+
**.env**
|
| 6 |
+
```env
|
| 7 |
+
FASTAPI_BASE_URL=http://localhost:8000
|
| 8 |
+
SECRET_TOKEN=your_secret_token_here
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
**fastapi.service.ts**
|
| 12 |
+
|
| 13 |
+
```typescript
|
| 14 |
+
import { Injectable } from "@nestjs/common";
|
| 15 |
+
import { HttpService } from "@nestjs/axios";
|
| 16 |
+
import { ConfigService } from "@nestjs/config";
|
| 17 |
+
import { firstValueFrom } from "rxjs";
|
| 18 |
+
|
| 19 |
+
@Injectable()
|
| 20 |
+
export class FastAPIService {
|
| 21 |
+
constructor(
|
| 22 |
+
private http: HttpService,
|
| 23 |
+
private config: ConfigService,
|
| 24 |
+
) {}
|
| 25 |
+
|
| 26 |
+
async analyzeText(text: string) {
|
| 27 |
+
const url = `${this.config.get("FASTAPI_BASE_URL")}/text/analyse`;
|
| 28 |
+
const token = this.config.get("SECRET_TOKEN");
|
| 29 |
+
|
| 30 |
+
const response = await firstValueFrom(
|
| 31 |
+
this.http.post(
|
| 32 |
+
url,
|
| 33 |
+
{ text },
|
| 34 |
+
{
|
| 35 |
+
headers: {
|
| 36 |
+
Authorization: `Bearer ${token}`,
|
| 37 |
+
},
|
| 38 |
+
},
|
| 39 |
+
),
|
| 40 |
+
);
|
| 41 |
+
|
| 42 |
+
return response.data;
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
**app.module.ts**
|
| 48 |
+
```typescript
|
| 49 |
+
import { Module } from "@nestjs/common";
|
| 50 |
+
import { ConfigModule } from "@nestjs/config";
|
| 51 |
+
import { HttpModule } from "@nestjs/axios";
|
| 52 |
+
import { AppController } from "./app.controller";
|
| 53 |
+
import { FastAPIService } from "./fastapi.service";
|
| 54 |
+
|
| 55 |
+
@Module({
|
| 56 |
+
imports: [ConfigModule.forRoot(), HttpModule],
|
| 57 |
+
controllers: [AppController],
|
| 58 |
+
providers: [FastAPIService],
|
| 59 |
+
})
|
| 60 |
+
export class AppModule {}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**app.controller.ts**
|
| 64 |
+
```typescript
|
| 65 |
+
import { Body, Controller, Post, Get } from '@nestjs/common';
|
| 66 |
+
import { FastAPIService } from './fastapi.service';
|
| 67 |
+
|
| 68 |
+
@Controller()
|
| 69 |
+
export class AppController {
|
| 70 |
+
constructor(private readonly fastapiService: FastAPIService) {}
|
| 71 |
+
|
| 72 |
+
@Post('analyze-text')
|
| 73 |
+
async callFastAPI(@Body('text') text: string) {
|
| 74 |
+
return this.fastapiService.analyzeText(text);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
@Get()
|
| 78 |
+
getHello(): string {
|
| 79 |
+
return 'NestJS is connected to FastAPI';
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
```
|
docs/security.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Security: Bearer Token Auth
|
| 2 |
+
|
| 3 |
+
All endpoints require authentication via Bearer token:
|
| 4 |
+
|
| 5 |
+
- Set `SECRET_TOKEN` in `.env`
|
| 6 |
+
- Add header: `Authorization: Bearer <SECRET_TOKEN>`
|
| 7 |
+
|
| 8 |
+
Unauthorized requests receive `403 Forbidden`.
|
| 9 |
+
|
docs/setup.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Setup & Installation
|
| 2 |
+
|
| 3 |
+
## 1. Clone the Repository
|
| 4 |
+
```bash
|
| 5 |
+
git clone https://github.com/cyberalertnepal/aiapi
|
| 6 |
+
cd aiapi
|
| 7 |
+
```
|
| 8 |
+
|
| 9 |
+
## 2. Install Dependencies
|
| 10 |
+
```bash
|
| 11 |
+
pip install -r requirements.txt
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
## 3. Configure Environment
|
| 15 |
+
Create a `.env` file:
|
| 16 |
+
```env
|
| 17 |
+
SECRET_TOKEN=your_secret_token_here
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
## 4. Run the API
|
| 21 |
+
```bash
|
| 22 |
+
uvicorn app:app --host 0.0.0.0 --port 8000
|
| 23 |
+
```
|
docs/structure.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## 🏗️ Project Structure
|
| 2 |
+
|
| 3 |
+
```
|
| 4 |
+
├── app.py # Main FastAPI app entrypoint
|
| 5 |
+
├── config.py # Configuration loader (.env, settings)
|
| 6 |
+
├── features/
|
| 7 |
+
│ ├── text_classifier/ # English (GPT-2) classifier
|
| 8 |
+
│ │ ├── controller.py
|
| 9 |
+
│ │ ├── inferencer.py
|
| 10 |
+
│ │ ├── model_loader.py
|
| 11 |
+
│ │ ├── preprocess.py
|
| 12 |
+
│ │ └── routes.py
|
| 13 |
+
│ └── nepali_text_classifier/ # Nepali (sentencepiece) classifier
|
| 14 |
+
│ ├── controller.py
|
| 15 |
+
│ ├── inferencer.py
|
| 16 |
+
│ ├── model_loader.py
|
| 17 |
+
│ ├── preprocess.py
|
| 18 |
+
│ └── routes.py
|
| 19 |
+
├── np_text_model/ # Nepali model artifacts (auto-downloaded)
|
| 20 |
+
│ ├── classifier/
|
| 21 |
+
│ │ └── sentencepiece.bpe.model
|
| 22 |
+
│ └── model_95_acc.pth
|
| 23 |
+
├── models/ # English GPT-2 model/tokenizer (auto-downloaded)
|
| 24 |
+
│ ├── merges.txt
|
| 25 |
+
│ ├── tokenizer.json
|
| 26 |
+
│ └── model_weights.pth
|
| 27 |
+
├── Dockerfile # Container build config
|
| 28 |
+
├── Procfile # Deployment entrypoint (for PaaS)
|
| 29 |
+
├── requirements.txt # Python dependencies
|
| 30 |
+
├── README.md
|
| 31 |
+
├── docs/ # Project documentation
|
| 32 |
+
└── .env # Secret token(s), environment config
|
| 33 |
+
```
|
| 34 |
+
### 🌟 Key Files and Their Roles
|
| 35 |
+
|
| 36 |
+
- **`app.py`**: Entry point initializing FastAPI app and routes.
|
| 37 |
+
- **`Procfile`**: Tells Railway (or similar platforms) how to run the program.
|
| 38 |
+
- **`requirements.txt`**: Tracks all Python dependencies for the project.
|
| 39 |
+
- **`__init__.py`**: Package initializer for the root module and submodules.
|
| 40 |
+
- **`features/text_classifier/`**
|
| 41 |
+
- **`controller.py`**: Handles logic between routes and the model.
|
| 42 |
+
- **`inferencer.py`**: Runs inference and returns predictions as well as file system
|
| 43 |
+
utilities.
|
| 44 |
+
- **`features/nepali_text_classifier/`** (served under the `/NP` route prefix)
|
| 45 |
+
- **`controller.py`**: Handles logic between routes and the model.
|
| 46 |
+
- **`inferencer.py`**: Runs inference and returns predictions as well as file system
|
| 47 |
+
utilities.
|
| 48 |
+
- **`model_loader.py`**: Loads the ML model and tokenizer.
|
| 49 |
+
- **`preprocess.py`**: Prepares input text for the model.
|
| 50 |
+
- **`routes.py`**: Defines API routes for text classification.
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
- [Main](../README.md)
|
features/nepali_text_classifier/__init__.py
ADDED
|
File without changes
|
features/nepali_text_classifier/controller.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
from fastapi import HTTPException, UploadFile, status, Depends
|
| 4 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from features.nepali_text_classifier.inferencer import classify_text
|
| 8 |
+
from features.nepali_text_classifier.preprocess import *
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
security = HTTPBearer()
|
| 12 |
+
|
| 13 |
+
def contains_english(text: str) -> bool:
    """Report whether ``text`` contains at least one ASCII Latin letter.

    Args:
        text: The string to inspect.

    Returns:
        True if any character in ``a-z`` or ``A-Z`` is present, else False.
    """
    # Newlines and tabs are dropped before searching, as in the original flow.
    without_breaks = text.replace("\n", "").replace("\t", "")
    match = re.search(r'[a-zA-Z]', without_breaks)
    return match is not None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token of the current request.

    The presented token is compared against the ``MY_SECRET_TOKEN``
    environment variable.

    Args:
        credentials: Bearer credentials resolved by the ``security`` dependency.

    Returns:
        The presented token when it matches the configured secret.

    Raises:
        HTTPException: 403 when the token does not match, or when no secret
            is configured in the environment.
    """
    import secrets  # local import: only needed for the constant-time compare

    token = credentials.credentials
    expected_token = os.getenv("MY_SECRET_TOKEN")

    # An unset secret must never authorise anyone. compare_digest avoids
    # leaking how many leading characters matched through response timing;
    # both sides are encoded because it rejects non-ASCII str arguments.
    authorised = expected_token is not None and secrets.compare_digest(
        token.encode("utf-8"), expected_token.encode("utf-8")
    )
    if not authorised:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token
|
| 28 |
+
|
| 29 |
+
async def nepali_text_analysis(text: str):
    """Validate a Nepali text and classify it.

    Args:
        text: Raw text submitted by the caller.

    Returns:
        The value returned by ``classify_text`` for the text.

    Raises:
        HTTPException: 400 when the text has fewer than 10 whitespace-separated
            words; 413 when it is longer than 10,000 characters.
    """
    # Validate the text exactly as submitted, before any normalisation.
    words = text.split()
    if len(words) < 10:
        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # Strings are immutable, so the helper's result must be captured to have
    # any effect. Fall back to the unmodified text if it returns nothing.
    text = end_symbol_for_NP_text(text) or text

    # Inference is blocking; run it in a worker thread to keep the loop free.
    result = await asyncio.to_thread(classify_text, text)

    return result
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Extract text from uploaded files (.docx, .pdf, .txt)
async def extract_file_contents(file:UploadFile)-> str:
    """Read an uploaded file and return its text content.

    The parser is selected from the upload's declared content type.

    Args:
        file: The uploaded file.

    Returns:
        The text produced by the matching parser.

    Raises:
        HTTPException: 415 when the content type is not DOCX, PDF or plain text.
    """
    raw_bytes = await file.read()
    stream = BytesIO(raw_bytes)
    mime_type = file.content_type

    if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return parse_docx(stream)
    if mime_type == "application/pdf":
        return parse_pdf(stream)
    if mime_type == "text/plain":
        return parse_txt(stream)

    raise HTTPException(status_code=415,detail="Invalid file type. Only .docx,.pdf and .txt are allowed")
|
| 54 |
+
|
| 55 |
+
async def handle_file_upload(file: UploadFile):
    """Extract the text of an uploaded file and classify it as a whole.

    Args:
        file: The uploaded file.

    Returns:
        The value returned by ``classify_text`` for the cleaned file text.

    Raises:
        HTTPException: 413 when the extracted text exceeds 10,000 characters;
            404 when the file has no non-whitespace content; any
            HTTPException raised during extraction is passed through
            unchanged; 500 for every other failure.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        # Normalise only after the emptiness check so an empty file is still
        # rejected. The helper's result must be captured to take effect; fall
        # back to the cleaned text if it returns nothing.
        cleaned_text = end_symbol_for_NP_text(cleaned_text) or cleaned_text

        result = await asyncio.to_thread(classify_text, cleaned_text)
        return result
    except HTTPException:
        # Deliberate client errors (413/404/415) must reach the caller as-is
        # instead of being rewritten to a generic 500 below.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
async def handle_sentence_level_analysis(text: str):
    """Classify a text sentence by sentence.

    The text is split on the danda character and every non-empty piece is
    classified separately.

    Args:
        text: Raw text submitted by the caller.

    Returns:
        ``{"analysis": [...]}`` with one entry per sentence holding the
        sentence text, the predicted label and its confidence.

    Raises:
        HTTPException: 413 when the stripped text exceeds 10,000 characters.
    """
    text = text.strip()
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # NOTE(review): the return value of this call is not used here.
    end_symbol_for_NP_text(text)

    # Split on the danda, drop empty pieces, and re-attach the danda.
    sentences = []
    for fragment in text.split("।"):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed + "।")

    results = []
    for sentence in sentences:
        end_symbol_for_NP_text(sentence)
        outcome = await asyncio.to_thread(classify_text, sentence)
        entry = {
            "text": sentence,
            "result": outcome["label"],
            "likelihood": outcome["confidence"]
        }
        results.append(entry)

    return {"analysis": results}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
async def handle_file_sentence(file:UploadFile):
    """Extract the text of an uploaded file and classify it per sentence.

    Args:
        file: The uploaded file.

    Returns:
        ``{"analysis": [...]}`` with one entry per sentence holding the
        sentence text, the predicted label and its confidence.

    Raises:
        HTTPException: 413 when the extracted text exceeds 10,000 characters;
            404 when the file has no non-whitespace content; any
            HTTPException raised during extraction is passed through
            unchanged; 500 for every other failure.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        # Split on the danda, drop empty pieces, and re-attach the danda, so
        # every sentence (including the last one) ends with it by construction.
        sentences = [s.strip() + "।" for s in cleaned_text.split("।") if s.strip()]

        results = []
        for sentence in sentences:
            result = await asyncio.to_thread(classify_text, sentence)
            results.append({
                "text": sentence,
                "result": result["label"],
                "likelihood": result["confidence"]
            })

        return {"analysis": results}

    except HTTPException:
        # Deliberate client errors (413/404/415) must reach the caller as-is
        # instead of being rewritten to a generic 500 below.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def classify(text: str):
    """Synchronously classify ``text`` by delegating to ``classify_text``."""
    prediction = classify_text(text)
    return prediction
|
| 131 |
+
|
features/nepali_text_classifier/inferencer.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from .model_loader import get_model_tokenizer
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def classify_text(text: str):
    """Run the cached classifier on ``text`` and return label and confidence.

    Args:
        text: The text to classify.

    Returns:
        A dict with ``"label"`` ("Human" when the predicted class index is 0,
        otherwise "AI") and ``"confidence"`` (the predicted class probability
        as a percentage rounded to two decimals).
    """
    model, tokenizer = get_model_tokenizer()

    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Move every tensor of the encoding onto the module-level device.
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # The model may hand back raw logits or an object exposing ``.logits``.
        if isinstance(outputs, torch.Tensor):
            logits = outputs
        else:
            logits = outputs.logits
        probs = F.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        prob_percent = probs[0][pred].item() * 100

    label = "Human" if pred == 0 else "AI"
    return {"label": label, "confidence": round(prob_percent, 2)}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
features/nepali_text_classifier/model_loader.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import logging
|
| 7 |
+
from huggingface_hub import snapshot_download
|
| 8 |
+
from transformers import AutoTokenizer, AutoModel
|
| 9 |
+
|
| 10 |
+
# Configs
|
| 11 |
+
REPO_ID = "Pujan-Dev/Nepali-AI-VS-HUMAN"
|
| 12 |
+
BASE_DIR = "./np_text_model"
|
| 13 |
+
TOKENIZER_DIR = os.path.join(BASE_DIR, "classifier") # <- update this to match your uploaded folder
|
| 14 |
+
WEIGHTS_PATH = os.path.join(BASE_DIR, "model_95_acc.pth") # <- change to match actual uploaded weight
|
| 15 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 16 |
+
|
| 17 |
+
# Define model class
|
| 18 |
+
class XLMRClassifier(nn.Module):
    """Two-class classifier: an ``xlm-roberta-base`` encoder plus a linear head."""

    def __init__(self):
        super().__init__()
        # Attribute names are kept as-is; the saved state dict is loaded
        # into this module by key.
        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoded inputs.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            The output of the linear head applied to the encoder's last
            hidden state at index 0 of the second dimension.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(first_position)
|
| 28 |
+
|
| 29 |
+
# Globals for caching
|
| 30 |
+
_model = None
|
| 31 |
+
_tokenizer = None
|
| 32 |
+
|
| 33 |
+
def download_model_repo():
    """Ensure the model artefacts are present under ``BASE_DIR``.

    The download is skipped only when both the weights file and the tokenizer
    directory already exist. A bare check that ``BASE_DIR`` exists would treat
    an interrupted or partial copy as complete and never repair it.
    """
    if os.path.isfile(WEIGHTS_PATH) and os.path.isdir(TOKENIZER_DIR):
        logging.info("Model already downloaded.")
        return

    snapshot_path = snapshot_download(repo_id=REPO_ID)
    os.makedirs(BASE_DIR, exist_ok=True)
    # dirs_exist_ok lets the copy fill in a partially populated directory.
    shutil.copytree(snapshot_path, BASE_DIR, dirs_exist_ok=True)
|
| 40 |
+
|
| 41 |
+
def load_model():
    """Download the artefacts if needed, then build the model and tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is on the module-level
        device and switched to evaluation mode.
    """
    download_model_repo()

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

    model = XLMRClassifier().to(device)
    state_dict = torch.load(WEIGHTS_PATH, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    return model, tokenizer
|
| 48 |
+
|
| 49 |
+
def get_model_tokenizer():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use."""
    global _model, _tokenizer

    already_loaded = _model is not None and _tokenizer is not None
    if not already_loaded:
        _model, _tokenizer = load_model()

    return _model, _tokenizer
|
| 54 |
+
|
features/nepali_text_classifier/preprocess.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import docx
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
import logging
|
| 5 |
+
from fastapi import HTTPException
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def parse_docx(file: BytesIO):
    """Return the text of every paragraph of a .docx stream, each followed by a newline."""
    document = docx.Document(file)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def parse_pdf(file: BytesIO):
    """Extract the text of every page of a PDF.

    Args:
        file: In-memory PDF content, handed to PyMuPDF as a stream.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        HTTPException: With status 500 when the document cannot be opened
            or one of its pages cannot be read.
    """
    try:
        # The context manager closes the document on success and on failure;
        # without it the handle was never released.
        with fitz.open(stream=file, filetype="pdf") as doc:
            return "".join(
                doc.load_page(page_num).get_text()
                for page_num in range(doc.page_count)
            )
    except Exception as e:
        logging.error(f"Error while processing PDF: {str(e)}")
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(
            status_code=500, detail="Error processing PDF file") from e
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def parse_txt(file: BytesIO):
    """Decode an in-memory text file as UTF-8 and return its content.

    A leading UTF-8 byte-order mark (written by some editors) is dropped so
    that it does not end up in the extracted text as a stray ``\\ufeff``
    character; input without a BOM decodes exactly as plain UTF-8.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    return file.read().decode("utf-8-sig")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def end_symbol_for_NP_text(text):
    """Return *text* terminated with the Devanagari danda ("।").

    The danda is appended only when *text* does not already end with one.
    Strings are immutable, so the caller's object is never modified and the
    result has to be returned explicitly.
    """
    if not text.endswith("।"):
        text += "।"
    return text
|
| 37 |
+
|
| 38 |
+
|
features/nepali_text_classifier/routes.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from slowapi import Limiter
|
| 2 |
+
from config import ACCESS_RATE
|
| 3 |
+
from .controller import handle_file_sentence, handle_sentence_level_analysis, nepali_text_analysis
|
| 4 |
+
from .inferencer import classify_text
|
| 5 |
+
from fastapi import APIRouter, File, Request, Depends, HTTPException, UploadFile
|
| 6 |
+
from fastapi.security import HTTPBearer
|
| 7 |
+
from slowapi import Limiter
|
| 8 |
+
from slowapi.util import get_remote_address
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
from .controller import handle_file_upload
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
limiter = Limiter(key_func=get_remote_address)
|
| 13 |
+
security = HTTPBearer()
|
| 14 |
+
|
| 15 |
+
# Input schema
|
| 16 |
+
class TextInput(BaseModel):
    """Request body carrying the text to analyse."""

    text: str
|
| 18 |
+
|
| 19 |
+
@router.post("/analyse")
@limiter.limit(ACCESS_RATE)
async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
    """Classify the submitted text and return the classifier's result as-is."""
    return classify_text(data.text)
|
| 24 |
+
|
| 25 |
+
@router.post("/upload")
@limiter.limit(ACCESS_RATE)
async def upload_file(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(security),
):
    """Pass the uploaded file to ``handle_file_upload`` and return its response."""
    response = await handle_file_upload(file)
    return response
|
| 29 |
+
|
| 30 |
+
@router.post("/analyse-sentences")
@limiter.limit(ACCESS_RATE)
async def analyse_sentences(
    request: Request,
    data: TextInput,
    token: str = Depends(security),
):
    """Run sentence-level analysis on the submitted text.

    This handler used to be named ``upload_file`` as well, which rebound the
    module-level name of the ``/upload`` handler. The route path, HTTP method
    and parameters are unchanged; only the Python function name differs.

    NOTE(review): slowapi appears to register decorated limits under the
    handler's module and function name, so the two same-named handlers
    probably shared one rate-limit entry -- confirm against slowapi.
    """
    return await handle_sentence_level_analysis(data.text)
|
| 34 |
+
|
| 35 |
+
@router.post("/file-sentences-analyse")
@limiter.limit(ACCESS_RATE)
async def analyze_sentance_file(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(security),
):
    """Pass the uploaded file to ``handle_file_sentence`` and return its response."""
    response = await handle_file_sentence(file)
    return response
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@router.get("/health")
@limiter.limit(ACCESS_RATE)
def health(request: Request):
    """Return a fixed payload showing that the service is reachable."""
    status_payload = {"status": "ok"}
    return status_payload
|
| 45 |
+
|
features/text_classifier/controller.py
CHANGED
|
@@ -5,12 +5,12 @@ from io import BytesIO
|
|
| 5 |
|
| 6 |
from fastapi import HTTPException, UploadFile, status, Depends
|
| 7 |
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 8 |
-
from nltk.tokenize import sent_tokenize
|
| 9 |
|
| 10 |
from .inferencer import classify_text
|
| 11 |
from .preprocess import parse_docx, parse_pdf, parse_txt
|
| 12 |
-
|
| 13 |
security = HTTPBearer()
|
|
|
|
| 14 |
|
| 15 |
# Verify Bearer token from Authorization header
|
| 16 |
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
|
|
@@ -52,7 +52,7 @@ async def extract_file_contents(file: UploadFile) -> str:
|
|
| 52 |
else:
|
| 53 |
raise HTTPException(
|
| 54 |
status_code=415,
|
| 55 |
-
detail="Invalid file type. Only .docx, .pdf
|
| 56 |
)
|
| 57 |
|
| 58 |
# Classify text from uploaded file
|
|
@@ -60,7 +60,7 @@ async def handle_file_upload(file: UploadFile):
|
|
| 60 |
try:
|
| 61 |
file_contents = await extract_file_contents(file)
|
| 62 |
if len(file_contents) > 10000:
|
| 63 |
-
|
| 64 |
|
| 65 |
cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
|
| 66 |
if not cleaned_text:
|
|
@@ -77,18 +77,22 @@ async def handle_file_upload(file: UploadFile):
|
|
| 77 |
logging.error(f"Error processing file: {e}")
|
| 78 |
raise HTTPException(status_code=500, detail="Error processing the file")
|
| 79 |
|
| 80 |
-
|
|
|
|
| 81 |
async def handle_sentence_level_analysis(text: str):
|
| 82 |
text = text.strip()
|
| 83 |
-
if
|
| 84 |
-
text+="."
|
|
|
|
| 85 |
if len(text) > 10000:
|
| 86 |
raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
results = []
|
| 90 |
for sentence in sentences:
|
| 91 |
-
if not sentence
|
| 92 |
continue
|
| 93 |
label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
|
| 94 |
results.append({
|
|
@@ -97,14 +101,13 @@ async def handle_sentence_level_analysis(text: str):
|
|
| 97 |
"perplexity": round(perplexity, 2),
|
| 98 |
"ai_likelihood": ai_likelihood
|
| 99 |
})
|
| 100 |
-
return {"analysis": results}
|
| 101 |
|
| 102 |
-
# Analyze each sentence from uploaded file
|
| 103 |
async def handle_file_sentence(file: UploadFile):
|
| 104 |
try:
|
| 105 |
file_contents = await extract_file_contents(file)
|
| 106 |
if len(file_contents) > 10000:
|
| 107 |
-
|
| 108 |
|
| 109 |
cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
|
| 110 |
if not cleaned_text:
|
|
@@ -119,7 +122,6 @@ async def handle_file_sentence(file: UploadFile):
|
|
| 119 |
logging.error(f"Error processing file: {e}")
|
| 120 |
raise HTTPException(status_code=500, detail="Error processing the file")
|
| 121 |
|
| 122 |
-
# Optional synchronous helper function
|
| 123 |
def classify(text: str):
|
| 124 |
return classify_text(text)
|
| 125 |
|
|
|
|
| 5 |
|
| 6 |
from fastapi import HTTPException, UploadFile, status, Depends
|
| 7 |
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
|
|
|
| 8 |
|
| 9 |
from .inferencer import classify_text
|
| 10 |
from .preprocess import parse_docx, parse_pdf, parse_txt
|
| 11 |
+
import spacy
|
| 12 |
security = HTTPBearer()
|
| 13 |
+
nlp = spacy.load("en_core_web_sm")
|
| 14 |
|
| 15 |
# Verify Bearer token from Authorization header
|
| 16 |
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
|
|
|
|
| 52 |
else:
|
| 53 |
raise HTTPException(
|
| 54 |
status_code=415,
|
| 55 |
+
detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
|
| 56 |
)
|
| 57 |
|
| 58 |
# Classify text from uploaded file
|
|
|
|
| 60 |
try:
|
| 61 |
file_contents = await extract_file_contents(file)
|
| 62 |
if len(file_contents) > 10000:
|
| 63 |
+
raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
|
| 64 |
|
| 65 |
cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
|
| 66 |
if not cleaned_text:
|
|
|
|
| 77 |
logging.error(f"Error processing file: {e}")
|
| 78 |
raise HTTPException(status_code=500, detail="Error processing the file")
|
| 79 |
|
| 80 |
+
|
| 81 |
+
|
| 82 |
async def handle_sentence_level_analysis(text: str):
|
| 83 |
text = text.strip()
|
| 84 |
+
if not text.endswith("."):
|
| 85 |
+
text += "."
|
| 86 |
+
|
| 87 |
if len(text) > 10000:
|
| 88 |
raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
|
| 89 |
+
|
| 90 |
+
doc = nlp(text)
|
| 91 |
+
sentences = [sent.text.strip() for sent in doc.sents]
|
| 92 |
+
|
| 93 |
results = []
|
| 94 |
for sentence in sentences:
|
| 95 |
+
if not sentence:
|
| 96 |
continue
|
| 97 |
label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
|
| 98 |
results.append({
|
|
|
|
| 101 |
"perplexity": round(perplexity, 2),
|
| 102 |
"ai_likelihood": ai_likelihood
|
| 103 |
})
|
|
|
|
| 104 |
|
| 105 |
+
return {"analysis": results}# Analyze each sentence from uploaded file
|
| 106 |
async def handle_file_sentence(file: UploadFile):
|
| 107 |
try:
|
| 108 |
file_contents = await extract_file_contents(file)
|
| 109 |
if len(file_contents) > 10000:
|
| 110 |
+
raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
|
| 111 |
|
| 112 |
cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
|
| 113 |
if not cleaned_text:
|
|
|
|
| 122 |
logging.error(f"Error processing file: {e}")
|
| 123 |
raise HTTPException(status_code=500, detail="Error processing the file")
|
| 124 |
|
|
|
|
| 125 |
def classify(text: str):
    """Synchronous helper that forwards *text* to ``classify_text``."""
    outcome = classify_text(text)
    return outcome
|
| 127 |
|
features/text_classifier/model_loader.py
CHANGED
|
@@ -5,7 +5,6 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
|
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
import torch
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
-
import nltk
|
| 9 |
load_dotenv()
|
| 10 |
REPO_ID = "Pujan-Dev/AI-Text-Detector"
|
| 11 |
MODEL_DIR = "./models"
|
|
@@ -19,10 +18,6 @@ _model, _tokenizer = None, None
|
|
| 19 |
def warmup():
|
| 20 |
global _model, _tokenizer
|
| 21 |
# Ensure punkt is available
|
| 22 |
-
nltk.download("punkt")
|
| 23 |
-
|
| 24 |
-
nltk.download('punkt_tab')
|
| 25 |
-
|
| 26 |
download_model_repo()
|
| 27 |
_model, _tokenizer = load_model()
|
| 28 |
logging.info("Its ready")
|
|
|
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
import torch
|
| 7 |
from dotenv import load_dotenv
|
|
|
|
| 8 |
load_dotenv()
|
| 9 |
REPO_ID = "Pujan-Dev/AI-Text-Detector"
|
| 10 |
MODEL_DIR = "./models"
|
|
|
|
| 18 |
def warmup():
|
| 19 |
global _model, _tokenizer
|
| 20 |
# Ensure punkt is available
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
download_model_repo()
|
| 22 |
_model, _tokenizer = load_model()
|
| 23 |
logging.info("Its ready")
|
readme.md
CHANGED
|
@@ -1,401 +1,35 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
|
| 4 |
|
| 5 |
-
##
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
**NOTE: If you add or change any dependencies, don't forget to record them in `requirements.txt` using `pip freeze > requirements.txt`.**
|
| 15 |
-
|
| 16 |
-
---
|
| 17 |
-
### Files STructure
|
| 18 |
-
|
| 19 |
-
```
|
| 20 |
-
├── app.py
|
| 21 |
-
├── features
|
| 22 |
-
│ └── text_classifier
|
| 23 |
-
│ ├── controller.py
|
| 24 |
-
│ ├── inferencer.py
|
| 25 |
-
│ ├── __init__.py
|
| 26 |
-
│ ├── model_loader.py
|
| 27 |
-
│ ├── preprocess.py
|
| 28 |
-
│ └── routes.py
|
| 29 |
-
├── __init__.py
|
| 30 |
-
├── Procfile
|
| 31 |
-
├── readme.md
|
| 32 |
-
└── requirements.txt
|
| 33 |
-
```
|
| 34 |
-
**`app.py`**: Entry point initializing FastAPI app and routes
|
| 35 |
-
**`Procfile`**: Tells Railway how to run the program
|
| 36 |
-
**`requirements.txt`**:Have all the packages that we use in our project
|
| 37 |
-
**`__init__.py`** : Package initializer for the root module
|
| 38 |
-
**FOLDER :features/text_classifier**
|
| 39 |
-
**`controller.py`** :Handles logic between routes and model
|
| 40 |
-
**`inferencer.py`** : Runs inference and returns predictions as well as files system
|
| 41 |
-
**`__init__.py`** :Initializes the module as a package
|
| 42 |
-
**`model_loader.py`** : Loads the ML model and tokenizer
|
| 43 |
-
**`preprocess.py`** :Prepares input text for the model
|
| 44 |
-
**`routes.py`** :Defines API routes for text classification
|
| 45 |
-
|
| 46 |
-
### **Functions**
|
| 47 |
-
|
| 48 |
-
1. **`load_model()`**
|
| 49 |
-
Loads the GPT-2 model and tokenizer from the specified directory paths.
|
| 50 |
-
|
| 51 |
-
2. **`lifespan()`**
|
| 52 |
-
Manages the application lifecycle. It initializes the model at startup and performs cleanup during shutdown.
|
| 53 |
-
|
| 54 |
-
3. **`classify_text_sync()`**
|
| 55 |
-
Synchronously tokenizes the input text and performs classification using the GPT-2 model. Returns both the classification result and perplexity score.
|
| 56 |
-
|
| 57 |
-
4. **`classify_text()`**
|
| 58 |
-
Asynchronously runs `classify_text_sync()` in a thread pool for non-blocking text classification.
|
| 59 |
-
|
| 60 |
-
5. **`analyze_text()`**
|
| 61 |
-
**POST** endpoint: Accepts text input, classifies it using `classify_text()`, and returns the result along with perplexity.
|
| 62 |
-
|
| 63 |
-
6. **`health()`**
|
| 64 |
-
**GET** endpoint: Performs a simple health check to confirm the API is operational.
|
| 65 |
-
|
| 66 |
-
7. **`parse_docx()`, `parse_pdf()`, `parse_txt()`**
|
| 67 |
-
Utility functions to extract and convert the contents of `.docx`, `.pdf`, and `.txt` files into plain text for classification.
|
| 68 |
-
|
| 69 |
-
8. **`warmup()`**
|
| 70 |
-
Downloads the model repository and initializes the model and tokenizer using the `load_model()` function.
|
| 71 |
-
|
| 72 |
-
9. **`download_model_repo()`**
|
| 73 |
-
Handles downloading the model files from the designated `MODEL` folder.
|
| 74 |
-
|
| 75 |
-
10. **`get_model_tokenizer()`**
|
| 76 |
-
Similar to `warmup()`, but includes a check to see if the model already exists. If not, it downloads the model; otherwise, it uses the previously downloaded one.
|
| 77 |
-
|
| 78 |
-
11. **`handle_file_upload()`**
|
| 79 |
-
Manages file uploads from the `/upload` route. Extracts text from the uploaded file, classifies it, and returns the results.
|
| 80 |
-
|
| 81 |
-
12. **`extract_file_contents()`**
|
| 82 |
-
Extracts and returns plain text content from uploaded files (e.g., PDF, DOCX, TXT).
|
| 83 |
-
|
| 84 |
-
13. **`handle_file_sentence()`**
|
| 85 |
-
Processes uploaded files by analyzing each sentence. Ensures the total file text is under 10,000 characters before classification.
|
| 86 |
-
|
| 87 |
-
14. **`handle_sentence_level_analysis()`**
|
| 88 |
-
Strips and checks each sentence’s length, then evaluates the likelihood of AI vs. human generation for each sentence.
|
| 89 |
-
|
| 90 |
-
15. **`analyze_sentences()`**
|
| 91 |
-
Divides long paragraphs into individual sentences, classifies each one, and returns a list of their classification results.
|
| 92 |
-
|
| 93 |
-
16. **`analyze_sentence_file()`**
|
| 94 |
-
A route function that analyzes sentences in uploaded files, similar to `handle_file_sentence()`.
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
---
|
| 98 |
-
|
| 99 |
-
### **Code Overview**
|
| 100 |
-
|
| 101 |
-
### **Running and Load Balancing:**
|
| 102 |
-
|
| 103 |
-
To run the app in production with load balancing:
|
| 104 |
-
|
| 105 |
-
```bash
|
| 106 |
-
uvicorn app:app --host 0.0.0.0 --port 8000
|
| 107 |
-
```
|
| 108 |
-
|
| 109 |
-
This command launches the FastAPI app.
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
### **Endpoints**
|
| 113 |
-
|
| 114 |
-
#### 1. **`/text/analyze`**
|
| 115 |
-
|
| 116 |
-
- **Method:** `POST`
|
| 117 |
-
- **Description:** Classifies whether the text is AI-generated or human-written.
|
| 118 |
-
- **Request:**
|
| 119 |
-
```json
|
| 120 |
-
{ "text": "sample text" }
|
| 121 |
-
```
|
| 122 |
-
- **Response:**
|
| 123 |
-
```json
|
| 124 |
-
{ "result": "AI-generated", "perplexity": 55.67,"ai_likelihood":66.6%}
|
| 125 |
-
```
|
| 126 |
-
|
| 127 |
-
#### 2. **`/health`**
|
| 128 |
-
|
| 129 |
-
- **Method:** `GET`
|
| 130 |
-
- **Description:** Returns the status of the API.
|
| 131 |
-
- **Response:**
|
| 132 |
-
```json
|
| 133 |
-
{ "status": "ok" }
|
| 134 |
-
```
|
| 135 |
-
#### 3. **`/text/upload`**
|
| 136 |
-
- **Method:** `POST`
|
| 137 |
-
- **Description:** Takes the files and check the contains inside and returns the results
|
| 138 |
-
- **Request:** Files
|
| 139 |
-
|
| 140 |
-
- **Response:**
|
| 141 |
-
```json
|
| 142 |
-
{ "result": "AI-generated", "perplexity": 55.67,"ai_likelihood":66.6%}
|
| 143 |
-
```
|
| 144 |
-
#### 4. **`/text/analyze_sentence_file`**
|
| 145 |
-
- **Method:** `POST`
|
| 146 |
-
- **Description:** Takes the files and check the contains inside and returns the results
|
| 147 |
-
- **Request:** Files
|
| 148 |
-
|
| 149 |
-
- **Response:**
|
| 150 |
-
```json
|
| 151 |
-
{
|
| 152 |
-
"content": "Artificial Intelligence (AI) and Machine Learning (ML) are rapidly transforming the way we \ninteract with technology. AI refers to the broader concept of machines being able to carry out \ntasks in a way that we would consider \"smart,\" while ML is a subset of AI that focuses on the \ndevelopment of algorithms that allow computers to learn from and make decisions based on \ndata. These technologies are behind innovations such as voice assistants, recommendation \nsystems, self-driving cars, and medical diagnosis tools. By analyzing large amounts of data, \nAI and ML can identify patterns, make predictions, and continuously improve their \nperformance over time, making them essential tools in modern industries ranging from \nhealthcare and finance to education and entertainment. \n \n",
|
| 153 |
-
"analysis": [
|
| 154 |
-
{
|
| 155 |
-
"sentence": "Artificial Intelligence (AI) and Machine Learning (ML) are rapidly transforming the way we interact with technology.",
|
| 156 |
-
"label": "AI-generated",
|
| 157 |
-
"perplexity": 8.17,
|
| 158 |
-
"ai_likelihood": 100
|
| 159 |
-
},
|
| 160 |
-
{
|
| 161 |
-
"sentence": "AI refers to the broader concept of machines being able to carry out tasks in a way that we would consider \"smart,\" while ML is a subset of AI that focuses on the development of algorithms that allow computers to learn from and make decisions based on data.",
|
| 162 |
-
"label": "AI-generated",
|
| 163 |
-
"perplexity": 19.34,
|
| 164 |
-
"ai_likelihood": 89.62
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"sentence": "These technologies are behind innovations such as voice assistants, recommendation systems, self-driving cars, and medical diagnosis tools.",
|
| 168 |
-
"label": "AI-generated",
|
| 169 |
-
"perplexity": 40.31,
|
| 170 |
-
"ai_likelihood": 66.32
|
| 171 |
-
},
|
| 172 |
-
{
|
| 173 |
-
"sentence": "By analyzing large amounts of data, AI and ML can identify patterns, make predictions, and continuously improve their performance over time, making them essential tools in modern industries ranging from healthcare and finance to education and entertainment.",
|
| 174 |
-
"label": "AI-generated",
|
| 175 |
-
"perplexity": 26.15,
|
| 176 |
-
"ai_likelihood": 82.05
|
| 177 |
-
}
|
| 178 |
-
]
|
| 179 |
-
}```
|
| 180 |
-
|
| 181 |
-
#### 5. **`/text/analyze_sentences`**
|
| 182 |
-
- **Method:** `POST`
|
| 183 |
-
- **Description:** Takes the text and check the contains inside and returns the results
|
| 184 |
-
- **Request:**
|
| 185 |
-
```json
|
| 186 |
-
{
|
| 187 |
-
"text": "This is an test text. This is an another Text "
|
| 188 |
-
}
|
| 189 |
-
```
|
| 190 |
-
|
| 191 |
-
- **Response:**
|
| 192 |
-
```json
|
| 193 |
-
{
|
| 194 |
-
"analysis": [
|
| 195 |
-
{
|
| 196 |
-
"sentence": "This is an test text.",
|
| 197 |
-
"label": "Human-written",
|
| 198 |
-
"perplexity": 510.28,
|
| 199 |
-
"ai_likelihood": 0
|
| 200 |
-
},
|
| 201 |
-
{
|
| 202 |
-
"sentence": "This is an another Text",
|
| 203 |
-
"label": "Human-written",
|
| 204 |
-
"perplexity": 3926.05,
|
| 205 |
-
"ai_likelihood": 0
|
| 206 |
-
}
|
| 207 |
-
]
|
| 208 |
-
}```
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
---
|
| 212 |
-
|
| 213 |
-
### **Running the API**
|
| 214 |
-
|
| 215 |
-
Start the server with:
|
| 216 |
|
|
|
|
| 217 |
```bash
|
| 218 |
-
uvicorn app:app --host 0.0.0.0 --port 8000
|
| 219 |
```
|
|
|
|
| 220 |
|
| 221 |
-
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
You can test the FastAPI endpoint using `curl` like this:
|
| 226 |
-
|
| 227 |
-
```bash
|
| 228 |
-
curl -X POST https://can-org-canspace.hf.space/analyze \
|
| 229 |
-
-H "Authorization: Bearer SECRET_CODE" \
|
| 230 |
-
-H "Content-Type: application/json" \
|
| 231 |
-
-d '{"text": "This is a sample sentence for analysis."}'
|
| 232 |
-
```
|
| 233 |
-
|
| 234 |
-
- The `-H "Authorization: Bearer SECRET_CODE"` part is used to simulate the **handshake**.
|
| 235 |
-
- FastAPI checks this token against the one loaded from the `.env` file.
|
| 236 |
-
- If the token matches, the request is accepted and processed.
|
| 237 |
-
- Otherwise, it responds with a `403 Forbidden` error (detail: `"Unauthorized"`).
|
| 238 |
|
| 239 |
---
|
| 240 |
|
| 241 |
-
##
|
| 242 |
-
|
| 243 |
-
- **Swagger UI:** `https://can-org-canspace.hf.space/docs` -> `/docs`
|
| 244 |
-
- **ReDoc:** `https://can-org-canspace.hf.space/redoc` -> `/redoc`
|
| 245 |
-
|
| 246 |
-
### **🔐 Handshake Mechanism**
|
| 247 |
-
|
| 248 |
-
In this part, we're implementing a simple handshake to verify that the request is coming from a trusted source (e.g., our NestJS server). Here's how it works:
|
| 249 |
-
|
| 250 |
-
- We load a secret token from the `.env` file.
|
| 251 |
-
- When a request is made to the FastAPI server, we extract the `Authorization` header and compare it with our expected secret token.
|
| 252 |
-
- If the token does **not** match, we immediately return a **403 Forbidden** response with the message `"Unauthorized"`.
|
| 253 |
-
- If the token **does** match, we allow the request to proceed to the next step.
|
| 254 |
-
|
| 255 |
-
The verification function looks like this:
|
| 256 |
-
|
| 257 |
-
```python
|
| 258 |
-
def verify_token(auth: str):
|
| 259 |
-
if auth != f"Bearer {EXPECTED_TOKEN}":
|
| 260 |
-
raise HTTPException(status_code=403, detail="Unauthorized")
|
| 261 |
-
```
|
| 262 |
-
|
| 263 |
-
This provides a basic but effective layer of security to prevent unauthorized access to the API.
|
| 264 |
-
|
| 265 |
-
### **Implement it with NEST.js**
|
| 266 |
-
|
| 267 |
-
NOTE: Make an micro service in NEST.JS and implement it there and call it from app.controller.ts
|
| 268 |
-
|
| 269 |
-
in fastapi.service.ts file what we have done is
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
``
|
| 274 |
-
|
| 275 |
-
├── src/
|
| 276 |
-
│ ├── app.controller.ts
|
| 277 |
-
│ ├── app.module.ts
|
| 278 |
-
│ └── fastapi.service.ts
|
| 279 |
-
├── .env
|
| 280 |
-
|
| 281 |
-
```
|
| 282 |
|
| 283 |
---
|
| 284 |
|
| 285 |
-
### Step-by-Step Setup
|
| 286 |
-
|
| 287 |
-
#### 1. `.env`
|
| 288 |
-
|
| 289 |
-
Create a `.env` file at the root with the following:
|
| 290 |
-
|
| 291 |
-
```environment
|
| 292 |
-
FASTAPI_BASE_URL=https://can-org-canspace.hf.space/
|
| 293 |
-
SECRET_TOKEN="SECRET_CODE_TOKEN"
|
| 294 |
-
```
|
| 295 |
-
|
| 296 |
-
#### 2. `fastapi.service.ts`
|
| 297 |
-
|
| 298 |
-
```javascript
|
| 299 |
-
// src/fastapi.service.ts
|
| 300 |
-
import { Injectable } from "@nestjs/common";
|
| 301 |
-
import { HttpService } from "@nestjs/axios";
|
| 302 |
-
import { ConfigService } from "@nestjs/config";
|
| 303 |
-
import { firstValueFrom } from "rxjs";
|
| 304 |
-
|
| 305 |
-
@Injectable()
|
| 306 |
-
export class FastAPIService {
|
| 307 |
-
constructor(
|
| 308 |
-
private http: HttpService,
|
| 309 |
-
private config: ConfigService,
|
| 310 |
-
) {}
|
| 311 |
-
|
| 312 |
-
async analyzeText(text: string) {
|
| 313 |
-
const url = `${this.config.get("FASTAPI_BASE_URL")}/analyze`;
|
| 314 |
-
const token = this.config.get("SECRET_TOKEN");
|
| 315 |
-
|
| 316 |
-
const response = await firstValueFrom(
|
| 317 |
-
this.http.post(
|
| 318 |
-
url,
|
| 319 |
-
{ text },
|
| 320 |
-
{
|
| 321 |
-
headers: {
|
| 322 |
-
Authorization: `Bearer ${token}`,
|
| 323 |
-
},
|
| 324 |
-
},
|
| 325 |
-
),
|
| 326 |
-
);
|
| 327 |
-
|
| 328 |
-
return response.data;
|
| 329 |
-
}
|
| 330 |
-
}
|
| 331 |
-
```
|
| 332 |
-
|
| 333 |
-
#### 3. `app.module.ts`
|
| 334 |
-
|
| 335 |
-
```javascript
|
| 336 |
-
// src/app.module.ts
|
| 337 |
-
import { Module } from "@nestjs/common";
|
| 338 |
-
import { ConfigModule } from "@nestjs/config";
|
| 339 |
-
import { HttpModule } from "@nestjs/axios";
|
| 340 |
-
import { AppController } from "./app.controller";
|
| 341 |
-
import { FastAPIService } from "./fastapi.service";
|
| 342 |
-
|
| 343 |
-
@Module({
|
| 344 |
-
imports: [ConfigModule.forRoot(), HttpModule],
|
| 345 |
-
controllers: [AppController],
|
| 346 |
-
providers: [FastAPIService],
|
| 347 |
-
})
|
| 348 |
-
export class AppModule {}
|
| 349 |
-
```
|
| 350 |
-
|
| 351 |
-
---
|
| 352 |
-
|
| 353 |
-
#### 4. `app.controller.ts`
|
| 354 |
-
|
| 355 |
-
```javascript
|
| 356 |
-
// src/app.controller.ts
|
| 357 |
-
import { Body, Controller, Post, Get, Query } from '@nestjs/common';
|
| 358 |
-
import { FastAPIService } from './fastapi.service';
|
| 359 |
-
|
| 360 |
-
@Controller()
|
| 361 |
-
export class AppController {
|
| 362 |
-
constructor(private readonly fastapiService: FastAPIService) {}
|
| 363 |
-
|
| 364 |
-
@Post('analyze-text')
|
| 365 |
-
async callFastAPI(@Body('text') text: string) {
|
| 366 |
-
return this.fastapiService.analyzeText(text);
|
| 367 |
-
}
|
| 368 |
-
|
| 369 |
-
@Get()
|
| 370 |
-
getHello(): string {
|
| 371 |
-
return 'NestJS is connected to FastAPI ';
|
| 372 |
-
}
|
| 373 |
-
}
|
| 374 |
-
```
|
| 375 |
-
|
| 376 |
-
### 🚀 How to Run
|
| 377 |
-
|
| 378 |
-
Run both servers, FastAPI and NestJS:
|
| 379 |
-
|
| 380 |
-
- for nest.js
|
| 381 |
-
```bash
|
| 382 |
-
npm run start
|
| 383 |
-
```
|
| 384 |
-
- for Fastapi
|
| 385 |
-
|
| 386 |
-
```bash
|
| 387 |
-
uvicorn app:app --reload
|
| 388 |
-
```
|
| 389 |
-
|
| 390 |
-
Make sure your FastAPI service is running at `http://localhost:8000`.
|
| 391 |
-
|
| 392 |
-
### Test with CURL
|
| 393 |
-
http://localhost:3000/-> Server of nest.js
|
| 394 |
-
```bash
|
| 395 |
-
curl -X POST http://localhost:3000/analyze-text \
|
| 396 |
-
-H 'Content-Type: application/json' \
|
| 397 |
-
-d '{"text": "This is a test input"}'
|
| 398 |
-
```
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
|
|
|
| 1 |
+
# 🚀 FastAPI AI Detector
|
| 2 |
|
| 3 |
+
A production-ready FastAPI app for detecting AI vs. human-written text in English and Nepali. It uses GPT-2 and SentencePiece-based models, with Bearer token security.
|
| 4 |
|
| 5 |
+
## 📂 Documentation
|
| 6 |
|
| 7 |
+
- [Project Structure](docs/structure.md)
|
| 8 |
+
- [API Endpoints](docs/api_endpoints.md)
|
| 9 |
+
- [Setup & Installation](docs/setup.md)
|
| 10 |
+
- [Deployment](docs/deployment.md)
|
| 11 |
+
- [Security](docs/security.md)
|
| 12 |
+
- [NestJS Integration](docs/nestjs_integration.md)
|
| 13 |
+
- [Core Functions](docs/functions.md)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
## ⚡ Quick Start
|
| 16 |
```bash
|
| 17 |
+
uvicorn app:app --host 0.0.0.0 --port 8000
|
| 18 |
```
|
| 19 |
+
## 🚀 Deployment
|
| 20 |
|
| 21 |
+
- **Local**: Use `uvicorn` as above.
|
| 22 |
+
- **Railway/Heroku**: Use the provided `Procfile`.
|
| 23 |
+
- **Hugging Face Spaces**: Use the `Dockerfile` for container deployment.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
---
|
| 26 |
|
| 27 |
+
## 💡 Tips
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
- **Model files auto-download at first start** if not found.
|
| 30 |
+
- **Keep `requirements.txt` up-to-date** after adding dependencies.
|
| 31 |
+
- **All endpoints require the correct `Authorization` header**.
|
| 32 |
+
- **For security**: Avoid committing `.env` to public repos.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
---
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -7,6 +7,7 @@ python-dotenv
|
|
| 7 |
python-docx
|
| 8 |
pydantic
|
| 9 |
PyMuPDF
|
| 10 |
-
nltk
|
| 11 |
python-multipart
|
| 12 |
-
slowapi
|
|
|
|
|
|
|
|
|
| 7 |
python-docx
|
| 8 |
pydantic
|
| 9 |
PyMuPDF
|
|
|
|
| 10 |
python-multipart
|
| 11 |
+
slowapi
|
| 12 |
+
spacy
|
| 13 |
+
nltk
|