Pujan-Dev committed on
Commit
4ecc57f
·
verified ·
1 Parent(s): 34bf657

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -66
app.py CHANGED
@@ -2,42 +2,36 @@ from fastapi import FastAPI, HTTPException, Depends, UploadFile, File
2
  from fastapi.security import HTTPBearer
3
  from pydantic import BaseModel
4
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
5
- from dotenv import load_dotenv
6
- from contextlib import asynccontextmanager
7
-
8
  import torch
9
- import asyncio
10
- import math
11
  import os
12
- import docx
13
- import fitz # PyMuPDF
14
  import logging
15
  from io import BytesIO
16
-
17
- # Setup logging
18
- logging.basicConfig(level=logging.DEBUG)
19
 
20
  # Load environment variables
 
21
  load_dotenv()
 
22
  SECRET_TOKEN = os.getenv("SECRET_TOKEN")
 
23
 
24
- # File Paths
25
- MODEL_PATH = "./Ai-Text-Detector/model"
26
- WEIGHTS_PATH = "./Ai-Text-Detector/model_weights.pth"
27
 
28
- # Global model and tokenizer
29
- model = None
30
- tokenizer = None
31
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
 
33
- # Security
34
- bearer_scheme = HTTPBearer()
 
35
 
36
- # Text input schema
37
- class TextInput(BaseModel):
38
- text: str
39
 
40
- # Load model and tokenizer
41
  def load_model():
42
  global model, tokenizer
43
  try:
@@ -47,32 +41,31 @@ def load_model():
47
  model_instance.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
48
  model_instance.to(device)
49
  model_instance.eval()
50
- model = model_instance
 
51
  except Exception as e:
 
52
  raise RuntimeError(f"Error loading model: {str(e)}")
53
 
54
- # Lifespan event to load model on startup
55
  @asynccontextmanager
56
  async def lifespan(app: FastAPI):
57
- load_model()
58
  yield
59
 
60
- # FastAPI app instance
61
  app = FastAPI(lifespan=lifespan)
62
 
63
- # Classification logic
 
 
 
 
64
  def classify_text(text: str):
65
  if not model or not tokenizer:
66
  raise RuntimeError("Model or tokenizer not loaded.")
67
 
68
- inputs = tokenizer(
69
- text,
70
- return_tensors="pt",
71
- truncation=True,
72
- padding="max_length",
73
- max_length=512
74
- )
75
-
76
  input_ids = inputs["input_ids"].to(device)
77
  attention_mask = inputs["attention_mask"].to(device)
78
 
@@ -88,17 +81,16 @@ def classify_text(text: str):
88
  else:
89
  return "Human-written", perplexity
90
 
91
- # Score converter (optional utility)
92
- def Perplexity_Converter(perplexity):
93
- return max(0, min(100, 100 - math.log2(perplexity) * 10))
94
-
95
- # Analyze text directly
96
  @app.post("/analyze")
97
  async def analyze_text(data: TextInput, token: str = Depends(bearer_scheme)):
 
98
  if token.credentials != SECRET_TOKEN:
99
  raise HTTPException(status_code=401, detail="Invalid token")
100
 
101
  text = data.text.strip()
 
 
102
  if not text:
103
  raise HTTPException(status_code=400, detail="Text cannot be empty")
104
 
@@ -106,64 +98,78 @@ async def analyze_text(data: TextInput, token: str = Depends(bearer_scheme)):
106
  raise HTTPException(status_code=400, detail="Text must contain at least two words")
107
 
108
  try:
 
109
  label, perplexity = await asyncio.to_thread(classify_text, text)
110
  return {"result": label, "perplexity": round(perplexity, 2)}
111
  except Exception as e:
112
- logging.error(f"Text analysis failed: {str(e)}")
113
  raise HTTPException(status_code=500, detail="Model processing error")
114
 
115
- # -------- File Upload and Parsing -------- #
116
  def parse_docx(file: BytesIO):
117
  doc = docx.Document(file)
118
- return "\n".join(para.text for para in doc.paragraphs)
 
 
 
119
 
 
120
  def parse_pdf(file: BytesIO):
121
  try:
122
  doc = fitz.open(stream=file, filetype="pdf")
123
- return "".join([doc.load_page(i).get_text() for i in range(doc.page_count)])
 
 
 
 
124
  except Exception as e:
125
- logging.error(f"PDF error: {str(e)}")
126
- raise HTTPException(status_code=500, detail="Error processing PDF")
127
 
 
128
  def parse_txt(file: BytesIO):
129
  return file.read().decode("utf-8")
130
 
 
131
  @app.post("/upload/")
132
  async def upload_file(file: UploadFile = File(...), token: str = Depends(bearer_scheme)):
133
- if token.credentials != SECRET_TOKEN:
134
- raise HTTPException(status_code=401, detail="Invalid token")
135
-
136
  try:
137
- content_type = file.content_type
138
- content = await file.read()
139
- if content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
140
- text = parse_docx(BytesIO(content))
141
- elif content_type == 'application/pdf':
142
- text = parse_pdf(BytesIO(content))
143
- elif content_type == 'text/plain':
144
- text = parse_txt(BytesIO(content))
145
  else:
146
- raise HTTPException(status_code=400, detail="Unsupported file type")
 
 
147
 
148
- if len(text) > 10000:
 
149
  return {"message": "File contains more than 10,000 characters."}
150
 
151
- cleaned_text = text.replace("\n", "").replace("\t", "")
 
 
 
152
  label, perplexity = await asyncio.to_thread(classify_text, cleaned_text)
153
  return {"result": label, "perplexity": round(perplexity, 2)}
154
 
155
  except Exception as e:
156
- logging.error(f"File processing error: {str(e)}")
157
- raise HTTPException(status_code=500, detail="Error processing file")
158
 
159
- # Health Check and Index
160
  @app.get("/health")
161
- def health_check():
162
  return {"status": "ok"}
163
 
 
164
  @app.get("/")
165
  def index():
166
  return {
167
  "message": "FastAPI AI Text Detector is running.",
168
- "usage": "Use /docs or /analyze or /upload to test the API."
169
  }
 
2
  from fastapi.security import HTTPBearer
3
  from pydantic import BaseModel
4
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
 
 
 
5
  import torch
 
 
6
  import os
7
+ import asyncio
8
+ from contextlib import asynccontextmanager
9
  import logging
10
  from io import BytesIO
11
+ import docx
12
+ import fitz # PyMuPDF
 
13
 
14
# Load environment variables (.env supplies SECRET_TOKEN)
from dotenv import load_dotenv
load_dotenv()

SECRET_TOKEN = os.getenv("SECRET_TOKEN")
bearer_scheme = HTTPBearer()  # Bearer-token auth dependency shared by all routes

# Model artifact locations. Uppercase constants: load_model() reads these
# as MODEL_PATH / WEIGHTS_PATH — the previous lowercase names were never
# referenced and left WEIGHTS_PATH undefined (NameError at startup).
MODEL_PATH = "./Ai-Text-Detector/model"
WEIGHTS_PATH = "./Ai-Text-Detector/model_weights.pth"

# NOTE: the app instance is created once, below, with the lifespan handler;
# an extra bare `app = FastAPI()` here was dead — it was rebound before any
# route registration.

# Global model and tokenizer variables, populated by load_model() at startup.
model, tokenizer = None, None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Logging setup
logging.basicConfig(level=logging.DEBUG)
 
33
 
34
+ # Load model and tokenizer function
35
  def load_model():
36
  global model, tokenizer
37
  try:
 
41
  model_instance.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
42
  model_instance.to(device)
43
  model_instance.eval()
44
+ model, tokenizer = model_instance, tokenizer
45
+ logging.info("Model loaded successfully.")
46
  except Exception as e:
47
+ logging.error(f"Error loading model: {str(e)}")
48
  raise RuntimeError(f"Error loading model: {str(e)}")
49
 
50
# Load model on app startup: FastAPI runs this context manager around the
# application's lifetime.
@asynccontextmanager
async def lifespan(app: FastAPI):
    load_model()  # populate the global model/tokenizer before serving requests
    yield  # the app serves requests while suspended here; no shutdown cleanup needed

# Attach the lifespan to the app instance.
# NOTE(review): this rebinds `app`, discarding any FastAPI() instance created
# earlier in the module — all routes must be registered on THIS instance
# (they are, since every @app decorator appears below this line).
app = FastAPI(lifespan=lifespan)
58
 
59
# Input schema for text analysis
class TextInput(BaseModel):
    """Request body for POST /analyze."""
    # Raw text to classify; the handler rejects empty or one-word input.
    text: str
62
+
63
+ # Function to classify text using the model
64
  def classify_text(text: str):
65
  if not model or not tokenizer:
66
  raise RuntimeError("Model or tokenizer not loaded.")
67
 
68
+ inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
 
 
 
 
 
 
 
69
  input_ids = inputs["input_ids"].to(device)
70
  attention_mask = inputs["attention_mask"].to(device)
71
 
 
81
  else:
82
  return "Human-written", perplexity
83
 
84
+ # POST route to analyze text with Bearer token
 
 
 
 
85
  @app.post("/analyze")
86
  async def analyze_text(data: TextInput, token: str = Depends(bearer_scheme)):
87
+ # Verify token
88
  if token.credentials != SECRET_TOKEN:
89
  raise HTTPException(status_code=401, detail="Invalid token")
90
 
91
  text = data.text.strip()
92
+
93
+ # Input validation
94
  if not text:
95
  raise HTTPException(status_code=400, detail="Text cannot be empty")
96
 
 
98
  raise HTTPException(status_code=400, detail="Text must contain at least two words")
99
 
100
  try:
101
+ # Classify text
102
  label, perplexity = await asyncio.to_thread(classify_text, text)
103
  return {"result": label, "perplexity": round(perplexity, 2)}
104
  except Exception as e:
105
+ logging.error(f"Error processing text: {str(e)}")
106
  raise HTTPException(status_code=500, detail="Model processing error")
107
 
108
# Function to parse .docx files
def parse_docx(file: BytesIO):
    """Extract plain text from a .docx byte stream.

    Returns one line per paragraph (trailing newline included, matching the
    previous per-paragraph append); empty document yields "".
    """
    doc = docx.Document(file)
    # Join at C speed instead of quadratic `text +=` in a loop.
    return "".join(para.text + "\n" for para in doc.paragraphs)
115
 
116
# Function to parse .pdf files
def parse_pdf(file: BytesIO):
    """Extract the concatenated text of every page of a PDF byte stream.

    Raises HTTPException(500) if PyMuPDF cannot open or read the document.
    """
    try:
        doc = fitz.open(stream=file, filetype="pdf")
        try:
            # Single join instead of quadratic `text +=` per page.
            return "".join(doc.load_page(i).get_text() for i in range(doc.page_count))
        finally:
            doc.close()  # release PyMuPDF resources even if extraction fails
    except Exception as e:
        logging.error(f"Error while processing PDF: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing PDF file")
128
 
129
# Function to parse .txt files
def parse_txt(file: BytesIO):
    """Decode an uploaded plain-text byte stream as UTF-8."""
    raw_bytes = file.read()
    return raw_bytes.decode("utf-8")
132
 
133
# POST route to upload files and analyze content
@app.post("/upload/")
async def upload_file(file: UploadFile = File(...), token: str = Depends(bearer_scheme)):
    """Accept a .docx/.pdf/.txt upload, extract its text, and classify it.

    Returns the classification label and rounded perplexity, or a message when
    the file exceeds 10,000 characters. 401 on bad token, 400 on unsupported
    type, 500 on processing failure.
    """
    # Verify token — parity with /analyze; without this check the route was
    # effectively unauthenticated despite requiring a Bearer header.
    if token.credentials != SECRET_TOKEN:
        raise HTTPException(status_code=401, detail="Invalid token")

    try:
        content = await file.read()  # read once; each parser gets its own BytesIO
        if file.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            file_contents = parse_docx(BytesIO(content))
        elif file.content_type == 'application/pdf':
            file_contents = parse_pdf(BytesIO(content))
        elif file.content_type == 'text/plain':
            file_contents = parse_txt(BytesIO(content))
        else:
            raise HTTPException(status_code=400, detail="Invalid file type. Only .docx, .pdf, and .txt are allowed.")

        # Log metadata only — dumping the whole document leaks user content
        # into DEBUG logs.
        logging.debug(f"Extracted {len(file_contents)} characters from {file.filename}")

        # Check if the text length exceeds 10,000 characters
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Clean the text by removing newline and tab characters
        cleaned_text = file_contents.replace("\n", "").replace("\t", "")

        # Analyze the cleaned text off the event loop
        label, perplexity = await asyncio.to_thread(classify_text, cleaned_text)
        return {"result": label, "perplexity": round(perplexity, 2)}

    except HTTPException:
        # Re-raise as-is: the broad handler below was remapping the intended
        # 400 (invalid file type) into a generic 500.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing the file")
163
 
164
# Health check route
@app.get("/health")
async def health_check():
    """Liveness probe: report that the service is up."""
    status_payload = {"status": "ok"}
    return status_payload
168
 
169
# Simple index route
@app.get("/")
def index():
    """Landing route: describe the service and where to exercise it."""
    return {
        "message": "FastAPI AI Text Detector is running.",
        # Keep the hint in sync with the routes that actually exist:
        # /upload/ is still registered, so it belongs in the usage string.
        "usage": "Use /docs or /analyze or /upload to test the API.",
    }