Pujan-Dev committed on
Commit
9afba1d
·
1 Parent(s): 753c2d1

refactor: added the base 10,000-character limit

Browse files
.gitignore CHANGED
@@ -58,4 +58,4 @@ node_modules/
58
  model/
59
  models/.gitattributes #<-- This line can stay if you only want to ignore that file, not the whole folder
60
 
61
- #
 
58
  model/
59
  models/.gitattributes #<-- This line can stay if you only want to ignore that file, not the whole folder
60
 
61
+ todo.md
app.py CHANGED
@@ -1,26 +1,36 @@
1
- from fastapi import FastAPI
 
 
2
  from contextlib import asynccontextmanager
3
  from features.text_classifier.routes import router as text_classifier_router
4
  from features.text_classifier.model_loader import warmup
5
  import nltk
6
-
 
 
 
7
  @asynccontextmanager
8
  async def lifespan(app: FastAPI):
9
-
10
- # Your model warmup
11
  warmup()
12
  yield
13
- # Optionally add cleanup here
14
 
 
15
 
16
- # Pass lifespan handler to FastAPI constructor
17
  app = FastAPI(lifespan=lifespan)
 
 
18
 
19
- app.include_router(text_classifier_router, prefix="/text", tags=["Text Classification"])
20
 
21
  @app.get("/")
22
  def index():
23
  return {
24
  "Message": "FastAPI is running...",
25
- "Try": "/text/analyze or /text/analyze-sentences"
26
  }
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ from fastapi import FastAPI, Request
4
  from contextlib import asynccontextmanager
5
  from features.text_classifier.routes import router as text_classifier_router
6
  from features.text_classifier.model_loader import warmup
7
  import nltk
8
+ from slowapi import Limiter, _rate_limit_exceeded_handler
9
+ from slowapi.util import get_remote_address
10
+ from slowapi.errors import RateLimitExceeded
11
+ import requests
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler passed to the ``FastAPI`` constructor.

    Calls ``warmup()`` once before the application starts serving, then
    yields control back to FastAPI. Nothing is run after the yield, so
    there is no shutdown cleanup.
    """
    # Startup phase: run the model warmup before any request is handled.
    warmup()

    # Hand control to the running application.
    yield
 
16
 
17
+ limiter = Limiter(key_func=get_remote_address)
18
 
 
19
  app = FastAPI(lifespan=lifespan)
20
+ app.state.limiter = limiter
21
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
22
 
23
+ app.include_router(text_classifier_router, prefix="/text")
24
 
@app.get("/")
def index():
    """Root endpoint: report that the service is up and name the text routes."""
    payload = {
        "Message": "FastAPI is running...",
        "Try": "/text/analyse or /text/analyse-sentences"
    }
    return payload
31
+
@app.get("/home")
@limiter.limit("5/minute")
async def homepage(request: Request):
    """Rate-limited demo endpoint (limit string: ``"5/minute"``).

    ``request`` is not read by the body; it is part of the signature
    because the ``limiter.limit`` decorator wraps this handler.
    NOTE(review): slowapi documents that the decorated endpoint must
    accept a ``request`` argument -- confirm against the slowapi docs.
    """
    return dict(msg="This is a good message")
36
+
config.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ACCESS_RATE=10
features/text_classifier/controller.py CHANGED
@@ -1,73 +1,75 @@
1
  from .inferencer import classify_text
2
  import asyncio
3
- from fastapi import HTTPException, UploadFile,status,Depends
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  from .preprocess import parse_docx, parse_pdf, parse_txt
6
  from nltk.tokenize import sent_tokenize
7
  import os
8
  from io import BytesIO
9
  import logging
10
-
11
-
12
  security = HTTPBearer()
13
 
 
14
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
15
  token = credentials.credentials
16
- if token != os.getenv("MY_SECRET_TOKEN"): # Replace with your actual secret
17
  raise HTTPException(
18
  status_code=status.HTTP_403_FORBIDDEN,
19
  detail="Invalid or expired token"
20
  )
21
  return token
22
 
23
-
24
-
25
-
26
  async def handle_text_analysis(text: str):
27
  text = text.strip()
28
  if not text or len(text.split()) < 2:
29
- raise HTTPException(
30
- status_code=400, detail="Text must contain at least two words"
31
- )
32
- label, perplexity,ai_likelihood = await asyncio.to_thread(classify_text, text)
33
- return {"result": label, "perplexity": round(int(perplexity), 2),"ai_likelihood":ai_likelihood}
34
-
35
 
 
36
  async def handle_file_sentance(file: UploadFile):
37
  try:
38
  file_contents = await extract_file_contents(file)
39
  if len(file_contents) > 10000:
40
  return {"message": "File contains more than 10,000 characters."}
41
- cleaned_text = file_contents.replace("\n", "").replace("\t", "")
 
 
42
  result = await handle_sentence_level_analysis(cleaned_text)
43
  return {"content": file_contents, **result}
44
  except Exception as e:
45
  logging.error(f"Error processing file: {str(e)}")
46
  raise HTTPException(status_code=500, detail="Error processing the file")
47
 
48
-
49
-
50
  async def handle_file_upload(file: UploadFile):
51
  try:
52
  file_contents = await extract_file_contents(file)
53
  if len(file_contents) > 10000:
54
  return {"message": "File contains more than 10,000 characters."}
55
- cleaned_text = file_contents.replace("\n", "").replace("\t", "")
56
- label, perplexity,ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
57
- return {"content":file_contents,"result": label, "perplexity": round(int(perplexity), 2),"ai_likelihood":ai_likelihood}
 
 
 
 
 
 
 
58
  except Exception as e:
59
  logging.error(f"Error processing file: {str(e)}")
60
  raise HTTPException(status_code=500, detail="Error processing the file")
61
 
62
-
63
  async def extract_file_contents(file: UploadFile):
64
  content = await file.read()
65
  file_stream = BytesIO(content)
66
-
67
- if (
68
- file.content_type
69
- == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
70
- ):
71
  return parse_docx(file_stream)
72
  elif file.content_type == "application/pdf":
73
  return parse_pdf(file_stream)
@@ -76,20 +78,23 @@ async def extract_file_contents(file: UploadFile):
76
  else:
77
  raise HTTPException(
78
  status_code=400,
79
- detail="Invalid file type. Only .docx, .pdf, and .txt are allowed.",
80
  )
81
 
 
82
  async def handle_sentence_level_analysis(text: str):
83
  text = text.strip()
84
  if not text or len(text.split()) < 2:
85
- raise HTTPException(
86
- status_code=400, detail="Text must contain at least two words"
87
- )
88
 
89
- sentences = sent_tokenize(text,language="english")
90
- results = []
91
 
 
 
92
  for sentence in sentences:
 
 
93
  label, perplexity, likelihood = await asyncio.to_thread(classify_text, sentence)
94
  results.append({
95
  "sentence": sentence,
@@ -97,8 +102,9 @@ async def handle_sentence_level_analysis(text: str):
97
  "perplexity": round(perplexity, 2),
98
  "ai_likelihood": likelihood
99
  })
100
-
101
  return {"analysis": results}
102
 
 
103
  def classify(text: str):
104
  return classify_text(text)
 
 
1
  from .inferencer import classify_text
2
  import asyncio
3
+ from fastapi import HTTPException, UploadFile, status, Depends,requests
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  from .preprocess import parse_docx, parse_pdf, parse_txt
6
  from nltk.tokenize import sent_tokenize
7
  import os
8
  from io import BytesIO
9
  import logging
10
+ import requests
 
11
  security = HTTPBearer()
12
 
13
+ # Token verification
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Check the request's bearer token against the ``MY_SECRET_TOKEN`` env var.

    Args:
        credentials: Bearer credentials resolved by the module-level
            ``HTTPBearer`` dependency (``security``).

    Returns:
        The presented token string when it matches the configured secret.

    Raises:
        HTTPException: 403 when the token does not match, or when
            ``MY_SECRET_TOKEN`` is not set (an unset secret never
            authenticates anyone).
    """
    # Local import keeps this block self-contained; hmac is standard library.
    import hmac

    token = credentials.credentials
    expected = os.getenv("MY_SECRET_TOKEN")

    # compare_digest runs in time independent of where the inputs differ,
    # unlike ``!=``, so response timing does not leak how much of the
    # secret a guess got right. Both sides are encoded to bytes because
    # compare_digest rejects non-ASCII ``str`` arguments.
    token_ok = expected is not None and hmac.compare_digest(
        token.encode("utf-8"), expected.encode("utf-8")
    )
    if not token_ok:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token
22
 
23
+ # Text classification
 
 
async def handle_text_analysis(text: str):
    """Classify one piece of raw text and return its scores.

    The text is stripped first. It must contain at least two
    whitespace-separated words and be at most 10,000 characters long;
    otherwise a 400 ``HTTPException`` is raised. ``classify_text`` runs
    in a worker thread via ``asyncio.to_thread``.

    Returns:
        A dict with keys ``result``, ``perplexity`` (rounded to two
        decimals) and ``ai_likelihood``.
    """
    text = text.strip()

    # Guard: reject empty input and single-word input.
    word_count = len(text.split())
    if text == "" or word_count < 2:
        raise HTTPException(status_code=400, detail="Text must contain at least two words")

    # Guard: enforce the 10,000-character ceiling.
    if len(text) > 10000:
        raise HTTPException(status_code=400, detail="The text should be less than 10,000 characters.")

    classification = await asyncio.to_thread(classify_text, text)
    label, perplexity, ai_likelihood = classification
    response = {
        "result": label,
        "perplexity": round(perplexity, 2),
        "ai_likelihood": ai_likelihood,
    }
    return response
 
32
 
33
+ # File sentence-level analysis
async def handle_file_sentance(file: UploadFile):
    """Extract an uploaded file's text and run sentence-level analysis on it.

    Args:
        file: The uploaded file; its text is obtained through
            ``extract_file_contents``.

    Returns:
        ``{"message": ...}`` when the extracted text is longer than
        10,000 characters; otherwise a dict with the original
        ``content`` merged with the keys returned by
        ``handle_sentence_level_analysis``.

    Raises:
        HTTPException: Client errors (for example the 400 raised here for
            an empty file, or any ``HTTPException`` raised by the helpers
            called inside the ``try``) propagate with their original
            status code. Any other failure is logged and reported as 500.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Newlines and tabs become spaces so adjacent words are not glued together.
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=400, detail="The file is empty or only contains whitespace.")

        result = await handle_sentence_level_analysis(cleaned_text)
        return {"content": file_contents, **result}
    except HTTPException:
        # HTTPException is an Exception subclass: without this clause the
        # blanket handler below would turn deliberate 4xx responses into 500.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
47
 
48
+ # File-level classification
 
async def handle_file_upload(file: UploadFile):
    """Extract an uploaded file's text and classify it as a whole.

    Args:
        file: The uploaded file; its text is obtained through
            ``extract_file_contents``.

    Returns:
        ``{"message": ...}`` when the extracted text is longer than
        10,000 characters; otherwise a dict with ``content`` (the
        original extracted text), ``result``, ``perplexity`` (rounded to
        two decimals) and ``ai_likelihood``.

    Raises:
        HTTPException: Client errors (for example the 400 raised here for
            an empty file, or any ``HTTPException`` raised by
            ``extract_file_contents``) propagate with their original
            status code. Any other failure is logged and reported as 500.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Newlines and tabs become spaces so adjacent words are not glued together.
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=400, detail="The file is empty or only contains whitespace.")

        # classify_text is synchronous; run it in a worker thread.
        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
        return {
            "content": file_contents,
            "result": label,
            "perplexity": round(perplexity, 2),
            "ai_likelihood": ai_likelihood,
        }
    except HTTPException:
        # HTTPException is an Exception subclass: without this clause the
        # blanket handler below would turn deliberate 4xx responses into 500.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
67
 
68
+ # File extraction
69
  async def extract_file_contents(file: UploadFile):
70
  content = await file.read()
71
  file_stream = BytesIO(content)
72
+ if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
 
 
 
 
73
  return parse_docx(file_stream)
74
  elif file.content_type == "application/pdf":
75
  return parse_pdf(file_stream)
 
78
  else:
79
  raise HTTPException(
80
  status_code=400,
81
+ detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
82
  )
83
 
84
+ # Sentence-level analysis
85
  async def handle_sentence_level_analysis(text: str):
86
  text = text.strip()
87
  if not text or len(text.split()) < 2:
88
+ raise HTTPException(status_code=400, detail="Text must contain at least two words")
 
 
89
 
90
+ if len(text) > 10000:
91
+ raise HTTPException(status_code=400, detail="Text must be less than 10,000 characters.")
92
 
93
+ sentences = sent_tokenize(text, language="english")
94
+ results = []
95
  for sentence in sentences:
96
+ if not sentence.strip():
97
+ continue
98
  label, perplexity, likelihood = await asyncio.to_thread(classify_text, sentence)
99
  results.append({
100
  "sentence": sentence,
 
102
  "perplexity": round(perplexity, 2),
103
  "ai_likelihood": likelihood
104
  })
 
105
  return {"analysis": results}
106
 
107
+ # Synchronous call
def classify(text: str):
    """Synchronous entry point: return ``classify_text(text)`` unchanged."""
    outcome = classify_text(text)
    return outcome
110
+
features/text_classifier/preprocess.py CHANGED
@@ -3,6 +3,8 @@ import docx
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
 
 
6
  def parse_docx(file: BytesIO):
7
  doc = docx.Document(file)
8
  text = ""
@@ -27,3 +29,4 @@ def parse_pdf(file: BytesIO):
27
 
28
  def parse_txt(file: BytesIO):
29
  return file.read().decode("utf-8")
 
 
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
6
+
7
+
8
  def parse_docx(file: BytesIO):
9
  doc = docx.Document(file)
10
  text = ""
 
29
 
30
  def parse_txt(file: BytesIO):
31
  return file.read().decode("utf-8")
32
+
features/text_classifier/routes.py CHANGED
@@ -3,6 +3,9 @@ from fastapi.security import HTTPBearer
3
  from pydantic import BaseModel
4
  from .controller import handle_text_analysis, handle_file_upload, handle_sentence_level_analysis, handle_file_sentance
5
  from .controller import verify_token
 
 
 
6
  router = APIRouter()
7
  security = HTTPBearer()
8
 
 
3
  from pydantic import BaseModel
4
  from .controller import handle_text_analysis, handle_file_upload, handle_sentence_level_analysis, handle_file_sentance
5
  from .controller import verify_token
6
+
7
+
8
+
9
  router = APIRouter()
10
  security = HTTPBearer()
11
 
requirements.txt CHANGED
@@ -8,4 +8,5 @@ python-docx
8
  pydantic
9
  PyMuPDF
10
  nltk
 
11
  python-multipart
 
8
  pydantic
9
  PyMuPDF
10
  nltk
11
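+ fastapi-limiter  # NOTE(review): app.py imports `slowapi`, not `fastapi_limiter` -- confirm `slowapi` is listed in this file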
12
  python-multipart