Pujan-Dev committed on
Commit
eb5aac2
·
1 Parent(s): 7b30c7c

refactor: clean up some code and fix typos

Browse files
features/text_classifier/controller.py CHANGED
@@ -1,59 +1,71 @@
1
- from .inferencer import classify_text
2
  import asyncio
3
- from fastapi import HTTPException, UploadFile, status, Depends,requests
 
 
 
4
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
- from .preprocess import parse_docx, parse_pdf, parse_txt
6
  from nltk.tokenize import sent_tokenize
7
- import os
8
- from io import BytesIO
9
- import logging
10
- import requests
11
  security = HTTPBearer()
12
 
13
- # Token verification
14
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
15
  token = credentials.credentials
16
- if token != os.getenv("MY_SECRET_TOKEN"):
 
17
  raise HTTPException(
18
  status_code=status.HTTP_403_FORBIDDEN,
19
  detail="Invalid or expired token"
20
  )
21
  return token
22
 
23
- # Text classification
24
  async def handle_text_analysis(text: str):
25
  text = text.strip()
26
  if not text or len(text.split()) < 10:
27
- raise HTTPException(status_code=400, detail="Text must contain at least two words")
28
  if len(text) > 10000:
29
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters.")
 
30
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, text)
31
- return {"result": label, "perplexity": round(perplexity, 2), "ai_likelihood": ai_likelihood}
 
 
 
 
32
 
33
- # File sentence-level analysis
34
- async def handle_file_sentance(file: UploadFile):
35
- try:
36
- file_contents = await extract_file_contents(file)
37
- if len(file_contents) > 10000:
38
- return {"message": "File contains more than 10,000 characters."}
39
- cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
40
- if not cleaned_text:
41
- raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
42
- result = await handle_sentence_level_analysis(cleaned_text)
43
- return {"content": file_contents, **result}
44
- except Exception as e:
45
- logging.error(f"Error processing file: {str(e)}")
46
- raise HTTPException(status_code=500, detail="Error processing the file")
47
 
48
- # File-level classification
 
 
 
 
 
 
 
 
 
 
 
 
49
  async def handle_file_upload(file: UploadFile):
50
  try:
51
  file_contents = await extract_file_contents(file)
52
  if len(file_contents) > 10000:
53
  return {"message": "File contains more than 10,000 characters."}
 
54
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
55
  if not cleaned_text:
56
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
 
57
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
58
  return {
59
  "content": file_contents,
@@ -62,49 +74,51 @@ async def handle_file_upload(file: UploadFile):
62
  "ai_likelihood": ai_likelihood
63
  }
64
  except Exception as e:
65
- logging.error(f"Error processing file: {str(e)}")
66
  raise HTTPException(status_code=500, detail="Error processing the file")
67
 
68
- # File extraction
69
- async def extract_file_contents(file: UploadFile):
70
- content = await file.read()
71
- file_stream = BytesIO(content)
72
- if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
73
- return parse_docx(file_stream)
74
- elif file.content_type == "application/pdf":
75
- return parse_pdf(file_stream)
76
- elif file.content_type == "text/plain":
77
- return parse_txt(file_stream)
78
- else:
79
- raise HTTPException(
80
- status_code=404,
81
- detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
82
- )
83
-
84
- # Sentence-level analysis
85
  async def handle_sentence_level_analysis(text: str):
86
  text = text.strip()
87
- if not text or len(text.split()) < 2:
88
- raise HTTPException(status_code=413, detail="Text must contain at least two words")
89
 
90
  if len(text) > 10000:
91
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters.")
92
 
93
  sentences = sent_tokenize(text, language="english")
94
  results = []
95
  for sentence in sentences:
96
  if not sentence.strip():
97
  continue
98
- label, perplexity, likelihood = await asyncio.to_thread(classify_text, sentence)
99
  results.append({
100
  "sentence": sentence,
101
  "label": label,
102
  "perplexity": round(perplexity, 2),
103
- "ai_likelihood": likelihood
104
  })
105
  return {"analysis": results}
106
 
107
- # Synchronous call
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def classify(text: str):
109
  return classify_text(text)
110
-
 
1
+ import os
2
  import asyncio
3
+ import logging
4
+ from io import BytesIO
5
+
6
+ from fastapi import HTTPException, UploadFile, status, Depends
7
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 
8
  from nltk.tokenize import sent_tokenize
9
+
10
+ from .inferencer import classify_text
11
+ from .preprocess import parse_docx, parse_pdf, parse_txt
12
+
13
  security = HTTPBearer()
14
 
# Verify Bearer token from Authorization header
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the bearer token against the MY_SECRET_TOKEN environment variable.

    Args:
        credentials: Bearer credentials resolved by the ``security`` dependency.

    Returns:
        The presented token string when it matches the configured secret.

    Raises:
        HTTPException: 403 when the token does not match, or when
            MY_SECRET_TOKEN is unset or empty.
    """
    # Function-scope import so this block does not depend on the file header.
    import hmac

    token = credentials.credentials
    expected_token = os.getenv("MY_SECRET_TOKEN")

    # An unset or empty secret must never authenticate anyone, so reject it
    # explicitly instead of relying on the comparison below.
    # compare_digest takes time independent of where the inputs differ, which
    # avoids leaking the secret through response timing. Comparing bytes keeps
    # it safe for non-ASCII tokens (str inputs must be ASCII-only).
    if not expected_token or not hmac.compare_digest(
        token.encode("utf-8"), expected_token.encode("utf-8")
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token
25
 
# Classify plain text input
async def handle_text_analysis(text: str):
    """Validate a piece of text and classify it.

    The text is stripped, then rejected if it has fewer than 10
    whitespace-separated words (400) or more than 10,000 characters (413).
    Classification runs in a worker thread via ``asyncio.to_thread`` so the
    event loop is not blocked by ``classify_text``.

    Returns:
        A dict with keys ``result``, ``perplexity`` (rounded to two decimal
        places) and ``ai_likelihood``.

    Raises:
        HTTPException: 400 for too few words, 413 for too many characters.
    """
    stripped = text.strip()

    # An empty string splits into zero words, so this single check also
    # covers the empty-input case.
    word_count = len(stripped.split())
    if word_count < 10:
        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")

    if len(stripped) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    verdict, ppl, likelihood = await asyncio.to_thread(classify_text, stripped)

    response = {"result": verdict}
    response["perplexity"] = round(ppl, 2)
    response["ai_likelihood"] = likelihood
    return response
40
 
# Extract text from uploaded files (.docx, .pdf, .txt)
async def extract_file_contents(file: UploadFile) -> str:
    """Read an uploaded file and hand its bytes to the matching parser.

    The parser is chosen from the upload's declared content type:
    ``parse_docx`` for Word documents, ``parse_pdf`` for PDFs and
    ``parse_txt`` for plain text. Each receives a ``BytesIO`` over the full
    file content.

    Args:
        file: The uploaded file.

    Returns:
        Whatever the selected parser returns (annotated as ``str``).

    Raises:
        HTTPException: 415 when the content type is missing or unsupported.
    """
    content = await file.read()
    file_stream = BytesIO(content)

    # Media types are case-insensitive and may carry parameters
    # (e.g. "text/plain; charset=utf-8"), so compare only the bare
    # type/subtype. A missing content type falls through to the 415 below.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return parse_docx(file_stream)
    if media_type == "application/pdf":
        return parse_pdf(file_stream)
    if media_type == "text/plain":
        return parse_txt(file_stream)

    raise HTTPException(
        status_code=415,
        detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
    )
57
+
58
+ # Classify text from uploaded file
59
  async def handle_file_upload(file: UploadFile):
60
  try:
61
  file_contents = await extract_file_contents(file)
62
  if len(file_contents) > 10000:
63
  return {"message": "File contains more than 10,000 characters."}
64
+
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
68
+
69
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
70
  return {
71
  "content": file_contents,
 
74
  "ai_likelihood": ai_likelihood
75
  }
76
  except Exception as e:
77
+ logging.error(f"Error processing file: {e}")
78
  raise HTTPException(status_code=500, detail="Error processing the file")
79
 
80
+ # Analyze each sentence in plain text input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
async def handle_sentence_level_analysis(text: str):
    """Classify every sentence of a text individually.

    The text is stripped and rejected with 413 when it exceeds 10,000
    characters. It is then split with NLTK's ``sent_tokenize`` (English),
    and each non-blank sentence is classified in a worker thread, one
    sentence at a time.

    Returns:
        ``{"analysis": [...]}`` where each entry holds ``sentence``,
        ``label``, ``perplexity`` (rounded to two decimal places) and
        ``ai_likelihood``. The list is empty when no sentences are found.

    Raises:
        HTTPException: 413 when the text is longer than 10,000 characters.
    """
    stripped = text.strip()

    if len(stripped) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    analysis = []
    for chunk in sent_tokenize(stripped, language="english"):
        # Skip whitespace-only fragments from the tokenizer.
        if chunk.strip():
            verdict, ppl, likelihood = await asyncio.to_thread(classify_text, chunk)
            entry = {
                "sentence": chunk,
                "label": verdict,
                "perplexity": round(ppl, 2),
                "ai_likelihood": likelihood,
            }
            analysis.append(entry)
    return {"analysis": analysis}
100
 
# Analyze each sentence from uploaded file
async def handle_file_sentence(file: UploadFile):
    """Extract text from an uploaded file and classify it sentence by sentence.

    Args:
        file: The uploaded file; its text is obtained through
            ``extract_file_contents``.

    Returns:
        ``{"message": ...}`` when the extracted text is longer than 10,000
        characters. Otherwise a dict with the original ``content`` plus the
        keys returned by ``handle_sentence_level_analysis``.

    Raises:
        HTTPException: 404 when the file is empty or whitespace-only; any
            HTTPException raised by the helpers is passed through unchanged;
            500 for any other, unexpected failure.
    """
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            return {"message": "File contains more than 10,000 characters."}

        # Collapse newlines and tabs to spaces before sentence splitting.
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        result = await handle_sentence_level_analysis(cleaned_text)
        return {
            "content": file_contents,
            **result
        }
    except HTTPException:
        # HTTPException is an Exception subclass. Without this clause the
        # deliberate client errors raised above and in the helpers would be
        # swallowed by the generic handler and reported as a 500.
        raise
    except Exception as e:
        # logging.exception records the traceback along with the message.
        logging.exception("Error processing file: %s", e)
        raise HTTPException(status_code=500, detail="Error processing the file") from e
120
+
# Optional synchronous helper function
def classify(text: str):
    """Call ``classify_text`` directly (blocking) and return its result unchanged."""
    outcome = classify_text(text)
    return outcome
124
+
features/text_classifier/routes.py CHANGED
@@ -9,7 +9,7 @@ from .controller import (
9
  handle_text_analysis,
10
  handle_file_upload,
11
  handle_sentence_level_analysis,
12
- handle_file_sentance,
13
  verify_token
14
  )
15
 
@@ -40,7 +40,7 @@ async def analyze_sentences(request: Request, data: TextInput, token: str = Depe
40
  @router.post("/analyse-sentance-file")
41
  @limiter.limit(ACCESS_RATE)
42
  async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
43
- return await handle_file_sentance(file)
44
 
45
  @router.get("/health")
46
  @limiter.limit(ACCESS_RATE)
 
9
  handle_text_analysis,
10
  handle_file_upload,
11
  handle_sentence_level_analysis,
12
+ handle_file_sentence,
13
  verify_token
14
  )
15
 
 
@router.post("/analyse-sentance-file")
@limiter.limit(ACCESS_RATE)
async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(verify_token)):
    """Run sentence-level analysis on an uploaded file.

    ``token`` is not read here; declaring it makes ``verify_token`` run as a
    dependency of this route. ``request`` is likewise unused in the body —
    presumably the rate limiter decorator needs it in the signature
    (TODO confirm against the limiter's documentation).

    Returns:
        The result of ``handle_file_sentence(file)``, unchanged.
    """
    analysis = await handle_file_sentence(file)
    return analysis
44
 
45
  @router.get("/health")
46
  @limiter.limit(ACCESS_RATE)