Pujan-Dev committed on
Commit
9c8da7e
·
1 Parent(s): b7c5baf

Added the changes for the Python PDF reader (replaced PyMuPDF/fitz with pypdf)

Browse files
.env-example DELETED
@@ -1,2 +0,0 @@
1
- MY_SECRET_TOKEN="SECRET_CODE_TOKEN"
2
-
 
 
 
features/nepali_text_classifier/preprocess.py CHANGED
@@ -1,9 +1,9 @@
1
- import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
6
-
7
 
8
  def parse_docx(file: BytesIO):
9
  doc = docx.Document(file)
@@ -15,11 +15,11 @@ def parse_docx(file: BytesIO):
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
- doc = fitz.open(stream=file, filetype="pdf")
19
  text = ""
20
- for page_num in range(doc.page_count):
21
- page = doc.load_page(page_num)
22
- text += page.get_text()
23
  return text
24
  except Exception as e:
25
  logging.error(f"Error while processing PDF: {str(e)}")
 
1
+ # import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
5
  from fastapi import HTTPException
6
+ from pypdf import PdfReader
7
 
8
  def parse_docx(file: BytesIO):
9
  doc = docx.Document(file)
 
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
+ doc = PdfReader(file)
19
  text = ""
20
+ for page in doc.pages:
21
+ text += page.extract_text()
22
+ # print(text)
23
  return text
24
  except Exception as e:
25
  logging.error(f"Error while processing PDF: {str(e)}")
features/text_classifier/controller.py CHANGED
@@ -60,12 +60,12 @@ async def handle_file_upload(file: UploadFile):
60
  try:
61
  file_contents = await extract_file_contents(file)
62
  if len(file_contents) > 10000:
63
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
64
 
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
68
-
69
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
70
  return {
71
  "content": file_contents,
@@ -102,12 +102,15 @@ async def handle_sentence_level_analysis(text: str):
102
  "ai_likelihood": ai_likelihood
103
  })
104
 
105
- return {"analysis": results}# Analyze each sentence from uploaded file
 
 
106
  async def handle_file_sentence(file: UploadFile):
107
  try:
108
  file_contents = await extract_file_contents(file)
109
  if len(file_contents) > 10000:
110
- raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
 
111
 
112
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
113
  if not cleaned_text:
 
60
  try:
61
  file_contents = await extract_file_contents(file)
62
  if len(file_contents) > 10000:
63
+ return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
64
 
65
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
66
  if not cleaned_text:
67
  raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
68
+ print(f"Cleaned text: '{cleaned_text}'") # Debugging statement
69
  label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
70
  return {
71
  "content": file_contents,
 
102
  "ai_likelihood": ai_likelihood
103
  })
104
 
105
+ return {"analysis": results}
106
+
107
+ # Analyze each sentence from uploaded file
108
  async def handle_file_sentence(file: UploadFile):
109
  try:
110
  file_contents = await extract_file_contents(file)
111
  if len(file_contents) > 10000:
112
+ # raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
113
+ return {"status_code": 413, "detail": "Text must be less than 10,000 characters"}
114
 
115
  cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
116
  if not cleaned_text:
features/text_classifier/preprocess.py CHANGED
@@ -1,4 +1,4 @@
1
- import fitz # PyMuPDF
2
  import docx
3
  from io import BytesIO
4
  import logging
@@ -15,18 +15,17 @@ def parse_docx(file: BytesIO):
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
- doc = fitz.open(stream=file, filetype="pdf")
19
  text = ""
20
- for page_num in range(doc.page_count):
21
- page = doc.load_page(page_num)
22
- text += page.get_text()
23
- return text
24
  except Exception as e:
25
  logging.error(f"Error while processing PDF: {str(e)}")
26
  raise HTTPException(
27
  status_code=500, detail="Error processing PDF file")
28
 
29
-
30
  def parse_txt(file: BytesIO):
31
  return file.read().decode("utf-8")
32
 
 
1
+ from pypdf import PdfReader
2
  import docx
3
  from io import BytesIO
4
  import logging
 
15
 
16
  def parse_pdf(file: BytesIO):
17
  try:
18
+ doc = PdfReader(file)
19
  text = ""
20
+ for page in doc.pages:
21
+ text += page.extract_text()
22
+ # print(text)
23
+ return text
24
  except Exception as e:
25
  logging.error(f"Error while processing PDF: {str(e)}")
26
  raise HTTPException(
27
  status_code=500, detail="Error processing PDF file")
28
 
 
29
  def parse_txt(file: BytesIO):
30
  return file.read().decode("utf-8")
31
 
requirements.txt CHANGED
@@ -15,6 +15,6 @@ tensorflow
15
  opencv-python
16
  pillow
17
  scipy
18
- fitz
19
  frontend
20
  tools
 
15
  opencv-python
16
  pillow
17
  scipy
18
+ pypdf
19
  frontend
20
  tools