Rezuwan committed on
Commit
a9328e8
·
verified ·
1 Parent(s): 729d844

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +62 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+ import docx
4
+ from PIL import Image
5
+ import pytesseract
6
+ from pdf2image import convert_from_path
7
+ import fitz # PyMuPDF
8
+
9
+
10
def extract_text_from_txt(file_path: str) -> str:
    """Return the entire contents of the file at *file_path*, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
13
+
14
def extract_text_from_md(file_path: str) -> str:
    """Return the raw Markdown source of *file_path*, decoded as UTF-8.

    The markup is returned as-is; no Markdown rendering or stripping is done.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_markdown = handle.read()
    return raw_markdown
17
+
18
def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated ``get_text()`` output of every page in a PDF.

    The document is opened with PyMuPDF (``fitz``) and closed when the
    ``with`` block exits. Page texts are joined with no separator.
    """
    with fitz.open(file_path) as pdf:
        # The join must run inside the ``with`` so pages are read while the
        # document is still open.
        return "".join(page.get_text() for page in pdf)
24
+
25
def extract_text_from_docx(file_path: str) -> str:
    """Return the text of each paragraph in a .docx file, joined by newlines.

    Only ``Document.paragraphs`` is read here.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
28
+
29
def extract_text_from_image(file_path: str) -> str:
    """Run OCR on the image at *file_path* and return the recognised text.

    Args:
        file_path: Path to an image file that Pillow can open.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        Whatever ``Image.open`` or ``pytesseract.image_to_string`` raise;
        nothing is caught here.
    """
    # Open the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes, instead of waiting for garbage
    # collection.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
32
+
33
def extract_text_from_scanned_pdf(file_path: str) -> str:
    """OCR a PDF by rasterising it and return the concatenated OCR output.

    ``convert_from_path`` turns the PDF into images (presumably one per
    page -- confirm against pdf2image docs). Each image is passed through
    ``pytesseract.image_to_string`` and the results are joined with no
    separator.
    """
    page_images = convert_from_path(file_path)
    return "".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )
39
+
40
def extract_text(file_path: str) -> Optional[str]:
    """Extract text from *file_path*, choosing an extractor by file extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.txt``, ``.md``, ``.pdf``, ``.docx``, ``.jpg``, ``.jpeg`` and ``.png``.

    For PDFs the embedded text is tried first. If it is empty or only
    whitespace, the OCR-based extractor is used instead.

    Returns:
        The extracted text, or ``None`` when the extension is unsupported or
        PDF extraction raised (a message is printed in both cases).

    Raises:
        Errors from the non-PDF extractors are not caught and propagate to
        the caller.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == '.txt':
        return extract_text_from_txt(file_path)

    if ext == '.md':
        return extract_text_from_md(file_path)

    if ext == '.pdf':
        try:
            extracted = extract_text_from_pdf(file_path)
            if extracted.strip():
                return extracted
            # No usable text layer: treat the file as a scan and OCR it.
            return extract_text_from_scanned_pdf(file_path)
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return None

    if ext == '.docx':
        return extract_text_from_docx(file_path)

    if ext in ('.jpg', '.jpeg', '.png'):
        return extract_text_from_image(file_path)

    print(f"Unsupported file type: {ext}")
    return None
requirements.txt ADDED
Binary file (312 Bytes). View file