Spaces:

can-org
/

Testing-AI-Contain

Sleeping

App Files Files Community

Pujan-Dev committed on May 27, 2025

Commit

5857799

1 Parent(s): 2f3ca4a

feat: added files , sentence wise text detector for np language

Browse files

Files changed (6) hide show

.gitignore +1 -0
features/nepali_text_classifier/controller.py +102 -7
features/nepali_text_classifier/inferencer.py +2 -0
features/nepali_text_classifier/preprocess.py +6 -0
features/nepali_text_classifier/routes.py +19 -3
features/text_classifier/controller.py +1 -1

.gitignore CHANGED Viewed

@@ -59,3 +59,4 @@ model/
 models/.gitattributes  #<-- This line can stay if you only want to ignore that file, not the whole folder
 todo.md

 models/.gitattributes  #<-- This line can stay if you only want to ignore that file, not the whole folder
 todo.md
+np_text_model

features/nepali_text_classifier/controller.py CHANGED Viewed

@@ -1,12 +1,21 @@
 import asyncio
-from fastapi import HTTPException, status, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 import os
 from features.nepali_text_classifier.inferencer import classify_text
 security = HTTPBearer()
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     token = credentials.credentials
     expected_token = os.getenv("MY_SECRET_TOKEN")
@@ -18,18 +27,104 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
     return token
 async def nepali_text_analysis(text: str):
-    # Fix: split once and reuse
     words = text.split()
     if len(words) < 10:
         raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-    label, confidence = await asyncio.to_thread(classify_text, text)
-    return {
-        "result": label,
-        "ai_likelihood": confidence
-    }
 def classify(text: str):
     return classify_text(text)

 import asyncio
+from io import BytesIO
+from fastapi import HTTPException, UploadFile, status, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 import os
 from features.nepali_text_classifier.inferencer import classify_text
+from  features.nepali_text_classifier.preprocess import *
+import re
 security = HTTPBearer()
def contains_english(text: str) -> bool:
    """Return True if *text* contains at least one ASCII Latin letter.

    Only the characters ``a``-``z`` and ``A``-``Z`` count; digits, whitespace,
    punctuation and Devanagari characters do not.
    """
    # Newlines and tabs can never match the letter class, so the search runs
    # on the raw text rather than on a whitespace-stripped copy of it.
    return re.search(r"[a-zA-Z]", text) is not None
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     token = credentials.credentials
     expected_token = os.getenv("MY_SECRET_TOKEN")
     return token
 async def nepali_text_analysis(text: str):
     """Validate *text* and classify it as a whole.

     Args:
         text: The text to classify.

     Returns:
         Whatever ``classify_text`` returns for the danda-terminated text.

     Raises:
         HTTPException: 400 when the text has fewer than 10
             whitespace-separated words, 413 when it is longer than 10,000
             characters.
     """
     if len(text.split()) < 10:
         raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
     # Strings are immutable, so the terminated text has to be bound back to
     # ``text``; calling a helper and discarding its result changes nothing.
     # The limits above are checked first so the appended danda cannot push a
     # valid input over the length limit.
     stripped = text.rstrip()
     if not stripped.endswith("।"):
         text = stripped + "।"
     # The classifier is synchronous; run it off the event loop.
     return await asyncio.to_thread(classify_text, text)
# Extract text from uploaded files (.docx, .pdf, .txt)
async def extract_file_contents(file: UploadFile) -> str:
    """Read an uploaded file and return the text produced by its parser.

    Args:
        file: The uploaded file; its declared content type selects the parser.

    Returns:
        The result of ``parse_docx``, ``parse_pdf`` or ``parse_txt``.

    Raises:
        HTTPException: 415 when the content type is not one of the three
            supported types (or is missing).
    """
    content = await file.read()
    file_stream = BytesIO(content)
    # The declared type comes straight from the upload's Content-Type header,
    # which may be absent, differently cased, or carry parameters such as
    # "text/plain; charset=utf-8". Compare on the bare media type only.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()
    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return parse_docx(file_stream)
    if media_type == "application/pdf":
        return parse_pdf(file_stream)
    if media_type == "text/plain":
        return parse_txt(file_stream)
    raise HTTPException(status_code=415, detail="Invalid file type. Only .docx,.pdf and .txt are allowed")
async def handle_file_upload(file: UploadFile):
    """Extract the text of an uploaded file and classify it as a whole.

    Args:
        file: The uploaded .docx, .pdf or .txt file.

    Returns:
        Whatever ``classify_text`` returns for the cleaned text.

    Raises:
        HTTPException: 413 when the extracted text exceeds 10,000 characters,
            404 when it is empty or whitespace only, any HTTPException raised
            by ``extract_file_contents``, and 500 for unexpected failures.
    """
    # Imported here so the error path cannot fail with a NameError when the
    # module itself does not import logging.
    import logging

    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        # Terminate the text with a danda. Strings are immutable, so the
        # result must be rebound rather than handed to a helper and dropped.
        if not cleaned_text.endswith("।"):
            cleaned_text += "।"

        return await asyncio.to_thread(classify_text, cleaned_text)
    except HTTPException:
        # Deliberate client errors (413/404/415) must reach the caller as-is
        # instead of being rewritten into a generic 500 below.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
async def handle_sentence_level_analysis(text: str):
    """Classify *text* one danda-delimited sentence at a time.

    Args:
        text: The text to analyse; surrounding whitespace is ignored.

    Returns:
        ``{"analysis": [...]}`` with one entry per non-empty sentence, each
        holding the sentence ``text``, the ``result`` label and the
        ``likelihood`` taken from the ``classify_text`` result.

    Raises:
        HTTPException: 413 when the stripped text exceeds 10,000 characters.
    """
    text = text.strip()
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

    # Splitting on the danda drops it, so it is re-attached to every non-empty
    # fragment. Each sentence is therefore already terminated and needs no
    # further end-symbol handling.
    sentences = [part.strip() + "।" for part in text.split("।") if part.strip()]

    results = []
    for sentence in sentences:
        # The classifier is synchronous; run it off the event loop.
        result = await asyncio.to_thread(classify_text, sentence)
        results.append({
            "text": sentence,
            "result": result["label"],
            "likelihood": result["confidence"]
        })
    return {"analysis": results}
async def handle_file_sentence(file: UploadFile):
    """Classify the text of an uploaded file one sentence at a time.

    Args:
        file: The uploaded .docx, .pdf or .txt file.

    Returns:
        ``{"analysis": [...]}`` with one entry per non-empty sentence, each
        holding the sentence ``text``, the ``result`` label and the
        ``likelihood`` taken from the ``classify_text`` result.

    Raises:
        HTTPException: 413 when the extracted text exceeds 10,000 characters,
            404 when it is empty or whitespace only, any HTTPException raised
            by ``extract_file_contents``, and 500 for unexpected failures.
    """
    # Imported here so the error path cannot fail with a NameError when the
    # module itself does not import logging.
    import logging

    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")

        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")

        # Splitting on the danda drops it, so it is re-attached to every
        # non-empty fragment; a final sentence without a danda is kept too.
        sentences = [part.strip() + "।" for part in cleaned_text.split("।") if part.strip()]

        results = []
        for sentence in sentences:
            result = await asyncio.to_thread(classify_text, sentence)
            results.append({
                "text": sentence,
                "result": result["label"],
                "likelihood": result["confidence"]
            })
        return {"analysis": results}
    except HTTPException:
        # Deliberate client errors (413/404/415) must reach the caller as-is
        # instead of being rewritten into a generic 500 below.
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file") from e
 def classify(text: str):
     """Synchronously classify *text* by delegating to ``classify_text``."""
     outcome = classify_text(text)
     return outcome

features/nepali_text_classifier/inferencer.py CHANGED Viewed

	@@ -19,3 +19,5 @@ def classify_text(text: str):
19
20	return {"label": "Human" if pred == 0 else "AI", "confidence": round(prob_percent, 2)}
21


19
20	return {"label": "Human" if pred == 0 else "AI", "confidence": round(prob_percent, 2)}
21
22	+
23	+

features/nepali_text_classifier/preprocess.py CHANGED Viewed

@@ -30,3 +30,9 @@ def parse_pdf(file: BytesIO):
 def parse_txt(file: BytesIO):
     return file.read().decode("utf-8")

 def parse_txt(file: BytesIO):
     """Return the contents of a plain-text upload decoded as UTF-8.

     Args:
         file: In-memory byte stream holding the file contents.

     Returns:
         The decoded text, without a leading byte-order mark.

     Raises:
         UnicodeDecodeError: If the bytes are not valid UTF-8.
     """
     # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
     # BOM, which would otherwise end up in the text as U+FEFF.
     return file.read().decode("utf-8-sig")
def end_symbol_for_NP_text(text):
    """Return *text* terminated with a danda ("।").

    Strings are immutable, so the terminated text is returned; the caller's
    own variable is never changed and must be rebound to the result.

    Args:
        text: The text to terminate.

    Returns:
        *text* unchanged when it already ends with "।", otherwise *text*
        with "।" appended.
    """
    if not text.endswith("।"):
        text += "।"
    return text

features/nepali_text_classifier/routes.py CHANGED Viewed

@@ -1,12 +1,13 @@
 from slowapi import Limiter
 from config import ACCESS_RATE
-from .controller import nepali_text_analysis
 from .inferencer import classify_text
-from fastapi import APIRouter, Request, Depends, HTTPException
 from fastapi.security import HTTPBearer
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from pydantic import BaseModel
 router = APIRouter()
 limiter = Limiter(key_func=get_remote_address)
 security = HTTPBearer()
@@ -18,10 +19,25 @@ class TextInput(BaseModel):
 @router.post("/analyse")
 @limiter.limit(ACCESS_RATE)
 async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
-    # Token is available as `token.credentials`, add validation if needed
     result = classify_text(data.text)
     return result
 @router.get("/health")
 @limiter.limit(ACCESS_RATE)
 def health(request: Request):

 from slowapi import Limiter
 from config import ACCESS_RATE
+from .controller import handle_file_sentence, handle_sentence_level_analysis, nepali_text_analysis
 from .inferencer import classify_text
+from fastapi import APIRouter, File, Request, Depends, HTTPException, UploadFile
 from fastapi.security import HTTPBearer
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from pydantic import BaseModel
+from .controller import handle_file_upload
 router = APIRouter()
 limiter = Limiter(key_func=get_remote_address)
 security = HTTPBearer()
 @router.post("/analyse")
 @limiter.limit(ACCESS_RATE)
 async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
     """Classify the submitted text and return the classifier's result.

     Args:
         request: Incoming request, passed to the rate limiter decorator.
         data: Request body carrying the text to classify.
         token: Bearer credentials resolved by the ``security`` dependency.

     Returns:
         Whatever ``classify_text`` returns for ``data.text``.
     """
     # Local import keeps this handler self-contained.
     import asyncio

     # NOTE(review): nepali_text_analysis is imported by this module but not
     # used here; routing through it would add the word-count and length
     # checks. Confirm whether this endpoint is meant to skip them.
     # The classifier is synchronous, so run it in a worker thread instead of
     # blocking the event loop for the duration of the call.
     return await asyncio.to_thread(classify_text, data.text)
@router.post("/upload")
@limiter.limit(ACCESS_RATE)
async def upload_file(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(security),
):
    """Classify the text of an uploaded file via ``handle_file_upload``."""
    outcome = await handle_file_upload(file)
    return outcome
@router.post("/analyse-sentences")
@limiter.limit(ACCESS_RATE)
# NOTE(review): this reuses the name of the "/upload" handler, so the module
# attribute ``upload_file`` ends up bound to this function. Consider renaming.
async def upload_file(
    request: Request,
    data: TextInput,
    token: str = Depends(security),
):
    """Run sentence-level analysis on the submitted text."""
    outcome = await handle_sentence_level_analysis(data.text)
    return outcome
@router.post("/file-sentences-analyse")
@limiter.limit(ACCESS_RATE)
async def analyze_sentance_file(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(security),
):
    """Run sentence-level analysis on the text of an uploaded file."""
    outcome = await handle_file_sentence(file)
    return outcome
 @router.get("/health")
 @limiter.limit(ACCESS_RATE)
 def health(request: Request):

features/text_classifier/controller.py CHANGED Viewed

@@ -52,7 +52,7 @@ async def extract_file_contents(file: UploadFile) -> str:
     else:
         raise HTTPException(
             status_code=415,
-            detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
         )
 # Classify text from uploaded file

     else:
         raise HTTPException(
             status_code=415,
+            detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
         )
 # Classify text from uploaded file