added cronjob

#2
by Pujan-Dev - opened
.env-example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ MY_SECRET_TOKEN="SECRET_CODE_TOKEN"
2
+
features/nepali_text_classifier/preprocess.py CHANGED
@@ -1,9 +1,9 @@
1
- # import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
6
- from pypdf import PdfReader
7
 
8
  def parse_docx(file: BytesIO):
9
  doc = docx.Document(file)
@@ -15,10 +15,11 @@ def parse_docx(file: BytesIO):
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
- doc = PdfReader(file)
19
  text = ""
20
- for page in doc.pages:
21
- text += page.extract_text()
 
22
  return text
23
  except Exception as e:
24
  logging.error(f"Error while processing PDF: {str(e)}")
 
1
+ import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
6
+
7
 
8
  def parse_docx(file: BytesIO):
9
  doc = docx.Document(file)
 
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
+ doc = fitz.open(stream=file, filetype="pdf")
19
  text = ""
20
+ for page_num in range(doc.page_count):
21
+ page = doc.load_page(page_num)
22
+ text += page.get_text()
23
  return text
24
  except Exception as e:
25
  logging.error(f"Error while processing PDF: {str(e)}")
features/text_classifier/controller.py CHANGED
@@ -60,12 +60,12 @@ async def handle_file_upload(file: UploadFile):
60
  try:
61
  file_contents = await extract_file_contents(file)
62
  if len(file_contents) > 10000:
63
- return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
64
 
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
68
- # print(f"Cleaned text: '{cleaned_text}'") # Debugging statement
69
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
70
  return {
71
  "content": file_contents,
@@ -102,15 +102,12 @@ async def handle_sentence_level_analysis(text: str):
102
  "ai_likelihood": ai_likelihood
103
  })
104
 
105
- return {"analysis": results}
106
-
107
- # Analyze each sentence from uploaded file
108
  async def handle_file_sentence(file: UploadFile):
109
  try:
110
  file_contents = await extract_file_contents(file)
111
  if len(file_contents) > 10000:
112
- # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
113
- return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
114
 
115
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
116
  if not cleaned_text:
 
60
  try:
61
  file_contents = await extract_file_contents(file)
62
  if len(file_contents) > 10000:
63
+ raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
64
 
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
68
+
69
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
70
  return {
71
  "content": file_contents,
 
102
  "ai_likelihood": ai_likelihood
103
  })
104
 
105
+ return {"analysis": results}# Analyze each sentence from uploaded file
 
 
106
  async def handle_file_sentence(file: UploadFile):
107
  try:
108
  file_contents = await extract_file_contents(file)
109
  if len(file_contents) > 10000:
110
+ raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
111
 
112
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
113
  if not cleaned_text:
features/text_classifier/preprocess.py CHANGED
@@ -1,4 +1,4 @@
1
- from pypdf import PdfReader
2
  import docx
3
  from io import BytesIO
4
  import logging
@@ -15,16 +15,18 @@ def parse_docx(file: BytesIO):
15
 
def parse_pdf(file: BytesIO):
    """Return the concatenated text of every page of an in-memory PDF.

    Args:
        file: Binary stream containing the PDF data.

    Returns:
        The extracted text of all pages, joined in page order.

    Raises:
        HTTPException: Status 500 if the PDF cannot be read or parsed.
    """
    try:
        reader = PdfReader(file)
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)
    except Exception as exc:
        # Log the underlying cause; the client only sees a generic 500.
        logging.error(f"Error while processing PDF: {str(exc)}")
        raise HTTPException(
            status_code=500, detail="Error processing PDF file")
27
 
 
def parse_txt(file: BytesIO):
    """Decode an in-memory plain-text file and return its contents.

    Args:
        file: Binary stream containing UTF-8 encoded text.

    Returns:
        The decoded text, without a leading byte-order mark if one was present.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which some editors prepend and which would otherwise end up in the text.
    return file.read().decode("utf-8-sig")
30
 
 
1
+ import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
 
15
 
def parse_pdf(file: BytesIO):
    """Return the concatenated text of every page of an in-memory PDF.

    Args:
        file: Binary stream containing the PDF data.

    Returns:
        The extracted text of all pages, joined in page order.

    Raises:
        HTTPException: Status 500 if the PDF cannot be opened or parsed.
    """
    try:
        # Use the document as a context manager so it is closed even when
        # text extraction fails part-way through.
        with fitz.open(stream=file, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Log the underlying cause; the client only sees a generic 500.
        logging.error(f"Error while processing PDF: {str(e)}")
        raise HTTPException(
            status_code=500, detail="Error processing PDF file") from e
28
 
29
+
def parse_txt(file: BytesIO):
    """Decode an in-memory plain-text file and return its contents.

    Args:
        file: Binary stream containing UTF-8 encoded text.

    Returns:
        The decoded text, without a leading byte-order mark if one was present.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
    # which some editors prepend and which would otherwise end up in the text.
    return file.read().decode("utf-8-sig")
32
 
requirements.txt CHANGED
@@ -15,6 +15,6 @@ tensorflow
15
  opencv-python
16
  pillow
17
  scipy
18
- pypdf
19
  frontend
20
  tools
 
15
  opencv-python
16
  pillow
17
  scipy
18
+ PyMuPDF
19
  frontend
20
  tools