Spaces:

can-org
/

Testing-AI-Contain

Running

Pujan-Dev committed on Feb 11

Commit

9c8da7e

1 Parent(s): b7c5baf

Added the changes for the python pdf reader

Files changed (5) hide show

.env-example DELETED Viewed

	@@ -1,2 +0,0 @@
1	- MY_SECRET_TOKEN="SECRET_CODE_TOKEN"
2	-

features/nepali_text_classifier/preprocess.py CHANGED Viewed

@@ -1,9 +1,9 @@
-import fitz  # PyMuPDF
 import docx
 from io import BytesIO
 import logging
 from fastapi import HTTPException
 def parse_docx(file: BytesIO):
     doc = docx.Document(file)
@@ -15,11 +15,11 @@ def parse_docx(file: BytesIO):
 def parse_pdf(file: BytesIO):
     try:
-        doc = fitz.open(stream=file, filetype="pdf")
         text = ""
-        for page_num in range(doc.page_count):
-            page = doc.load_page(page_num)
-            text += page.get_text()
         return text
     except Exception as e:
         logging.error(f"Error while processing PDF: {str(e)}")

+# import fitz  # PyMuPDF
 import docx
 from io import BytesIO
 import logging
 from fastapi import HTTPException
+from pypdf import PdfReader
 def parse_docx(file: BytesIO):
     doc = docx.Document(file)
 def parse_pdf(file: BytesIO):
     try:
+        doc = PdfReader(file)
         text = ""
+        for page in doc.pages:
+            text += page.extract_text()
+            # print(text)
         return text
     except Exception as e:
         logging.error(f"Error while processing PDF: {str(e)}")

features/text_classifier/controller.py CHANGED Viewed

@@ -60,12 +60,12 @@ async def handle_file_upload(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
             raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
         label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
         return {
             "content": file_contents,
@@ -102,12 +102,15 @@ async def handle_sentence_level_analysis(text: str):
             "ai_likelihood": ai_likelihood
         })
-    return {"analysis": results}# Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:

     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
+            return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
             raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
+        print(f"Cleaned text: '{cleaned_text}'")  # Debugging statement
         label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
         return {
             "content": file_contents,
             "ai_likelihood": ai_likelihood
         })
+    return {"analysis": results}
+# Analyze each sentence from uploaded file
 async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
+            # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+            return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:

features/text_classifier/preprocess.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import fitz  # PyMuPDF
 import docx
 from io import BytesIO
 import logging
@@ -15,18 +15,17 @@ def parse_docx(file: BytesIO):
 def parse_pdf(file: BytesIO):
     try:
-        doc = fitz.open(stream=file, filetype="pdf")
         text = ""
-        for page_num in range(doc.page_count):
-            page = doc.load_page(page_num)
-            text += page.get_text()
-        return text
     except Exception as e:
         logging.error(f"Error while processing PDF: {str(e)}")
         raise HTTPException(
             status_code=500, detail="Error processing PDF file")
 def parse_txt(file: BytesIO):
     return file.read().decode("utf-8")

+from pypdf import PdfReader
 import docx
 from io import BytesIO
 import logging
 def parse_pdf(file: BytesIO):
     try:
+        doc = PdfReader(file)
         text = ""
+        for page in doc.pages:
+            text += page.extract_text()
+            # print(text)
+        return text
     except Exception as e:
         logging.error(f"Error while processing PDF: {str(e)}")
         raise HTTPException(
             status_code=500, detail="Error processing PDF file")
 def parse_txt(file: BytesIO):
     return file.read().decode("utf-8")

requirements.txt CHANGED Viewed

@@ -15,6 +15,6 @@ tensorflow
 opencv-python
 pillow
 scipy
-fitz
 frontend
 tools

 opencv-python
 pillow
 scipy
+pypdf
 frontend
 tools