Pujan-Dev committed on
Commit
6b04257
·
1 Parent(s): f6f16d9

fixed the bias

Browse files
features/nepali_text_classifier/controller.py CHANGED
@@ -23,6 +23,41 @@ def contains_english(text: str) -> bool:
23
  return bool(re.search(r'[a-zA-Z]', cleaned))
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
27
  token = credentials.credentials
28
  expected_token = Config.SECRET_TOKEN
@@ -38,8 +73,8 @@ async def nepali_text_analysis(text: str, models: str | None = None):
38
  words = text.split()
39
  if len(words) < 10:
40
  raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
41
- if len(text) > 10000:
42
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
43
 
44
  selected_models = parse_selected_models(models)
45
  result = await asyncio.to_thread(classify_text, text, selected_models, 2)
@@ -64,8 +99,8 @@ async def handle_file_upload(file: UploadFile, models: str | None = None):
64
  try:
65
  file_contents = await extract_file_contents(file)
66
  end_symbol_for_NP_text(file_contents)
67
- if len(file_contents) > 10000:
68
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
69
 
70
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
71
  if not cleaned_text:
@@ -82,8 +117,8 @@ async def handle_file_upload(file: UploadFile, models: str | None = None):
82
 
83
  async def handle_sentence_level_analysis(text: str, models: str | None = None):
84
  text = text.strip()
85
- if len(text) > 10000:
86
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
87
 
88
  end_symbol_for_NP_text(text)
89
 
@@ -91,14 +126,19 @@ async def handle_sentence_level_analysis(text: str, models: str | None = None):
91
  sentences = [s.strip() + "।" for s in text.split("।") if s.strip()]
92
  selected_models = parse_selected_models(models)
93
 
 
 
 
 
94
  results = []
95
  for sentence in sentences:
96
  end_symbol_for_NP_text(sentence)
97
  result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
 
98
  results.append({
99
  "text": sentence,
100
- "result": result["label"],
101
- "likelihood": result["confidence"]
102
  })
103
 
104
  return {"analysis": results}
@@ -107,8 +147,8 @@ async def handle_sentence_level_analysis(text: str, models: str | None = None):
107
  async def handle_file_sentence(file:UploadFile, models: str | None = None):
108
  try:
109
  file_contents = await extract_file_contents(file)
110
- if len(file_contents) > 10000:
111
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
112
 
113
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
114
  if not cleaned_text:
@@ -119,15 +159,20 @@ async def handle_file_sentence(file:UploadFile, models: str | None = None):
119
  sentences = [s.strip() + "।" for s in cleaned_text.split("।") if s.strip()]
120
  selected_models = parse_selected_models(models)
121
 
 
 
 
 
122
  results = []
123
  for sentence in sentences:
124
  end_symbol_for_NP_text(sentence)
125
 
126
  result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
 
127
  results.append({
128
  "text": sentence,
129
- "result": result["label"],
130
- "likelihood": result["confidence"]
131
  })
132
 
133
  return {"analysis": results}
 
23
  return bool(re.search(r'[a-zA-Z]', cleaned))
24
 
25
 
26
+ def _clamp(value: float, lower: float, upper: float) -> float:
27
+ return max(lower, min(upper, value))
28
+
29
+
30
+ def _raw_ai_score(label: str, confidence: float) -> float:
31
+ conf = _clamp(float(confidence), 0.0, 100.0)
32
+ return conf if label == "AI" else (100.0 - conf)
33
+
34
+
35
+ def _sentence_bias_strength(overall_confidence: float) -> float:
36
+ # Stronger bias so sentence output follows the overall document decision.
37
+ # Equation: beta = min(0.80, 0.40 + 0.40 * (C_doc / 100))
38
+ return min(0.80, 0.40 + 0.40 * (_clamp(overall_confidence, 0.0, 100.0) / 100.0))
39
+
40
+
41
+ def _biased_sentence_result(sentence_result: dict, overall_confidence: float, target_label: str = "Human") -> dict:
42
+ raw_label = sentence_result["label"]
43
+ raw_confidence = float(sentence_result["confidence"])
44
+ raw_ai = _raw_ai_score(raw_label, raw_confidence)
45
+
46
+ target_ai = 100.0 if target_label == "AI" else 0.0
47
+ beta = _sentence_bias_strength(overall_confidence)
48
+
49
+ # Equation: S_biased = (1 - beta) * S_raw + beta * T
50
+ biased_ai = _clamp((1.0 - beta) * raw_ai + beta * target_ai, 0.0, 100.0)
51
+ # Force final label toward overall target to ensure overall bias is applied.
52
+ biased_label = target_label
53
+ biased_confidence = biased_ai if target_label == "AI" else (100.0 - biased_ai)
54
+
55
+ return {
56
+ "biased_label": biased_label,
57
+ "biased_confidence": round(biased_confidence, 2),
58
+ }
59
+
60
+
61
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
62
  token = credentials.credentials
63
  expected_token = Config.SECRET_TOKEN
 
73
  words = text.split()
74
  if len(words) < 10:
75
  raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
76
+ if len(text) > 50000:
77
+ raise HTTPException(status_code=413, detail="Text must be less than 50 ,000 characters")
78
 
79
  selected_models = parse_selected_models(models)
80
  result = await asyncio.to_thread(classify_text, text, selected_models, 2)
 
99
  try:
100
  file_contents = await extract_file_contents(file)
101
  end_symbol_for_NP_text(file_contents)
102
+ if len(file_contents) > 50000:
103
+ raise HTTPException(status_code=413, detail="Text must be less than 50,000 characters")
104
 
105
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
106
  if not cleaned_text:
 
117
 
118
  async def handle_sentence_level_analysis(text: str, models: str | None = None):
119
  text = text.strip()
120
+ if len(text) > 50000:
121
+ raise HTTPException(status_code=413, detail="Text must be less than 50,000 characters")
122
 
123
  end_symbol_for_NP_text(text)
124
 
 
126
  sentences = [s.strip() + "।" for s in text.split("।") if s.strip()]
127
  selected_models = parse_selected_models(models)
128
 
129
+ overall = await asyncio.to_thread(classify_text, text, selected_models, 2)
130
+ overall_label = overall["label"]
131
+ overall_confidence = float(overall["confidence"])
132
+
133
  results = []
134
  for sentence in sentences:
135
  end_symbol_for_NP_text(sentence)
136
  result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
137
+ biased = _biased_sentence_result(result, overall_confidence, target_label=overall_label)
138
  results.append({
139
  "text": sentence,
140
+ "result": biased["biased_label"],
141
+ "likelihood": biased["biased_confidence"],
142
  })
143
 
144
  return {"analysis": results}
 
147
  async def handle_file_sentence(file:UploadFile, models: str | None = None):
148
  try:
149
  file_contents = await extract_file_contents(file)
150
+ if len(file_contents) > 50000:
151
+ raise HTTPException(status_code=413, detail="Text must be less than 50,000 characters")
152
 
153
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
154
  if not cleaned_text:
 
159
  sentences = [s.strip() + "।" for s in cleaned_text.split("।") if s.strip()]
160
  selected_models = parse_selected_models(models)
161
 
162
+ overall = await asyncio.to_thread(classify_text, cleaned_text, selected_models, 2)
163
+ overall_label = overall["label"]
164
+ overall_confidence = float(overall["confidence"])
165
+
166
  results = []
167
  for sentence in sentences:
168
  end_symbol_for_NP_text(sentence)
169
 
170
  result = await asyncio.to_thread(classify_text, sentence, selected_models, 2)
171
+ biased = _biased_sentence_result(result, overall_confidence, target_label=overall_label)
172
  results.append({
173
  "text": sentence,
174
+ "result": biased["biased_label"],
175
+ "likelihood": biased["biased_confidence"],
176
  })
177
 
178
  return {"analysis": results}