import os
import PyPDF2
import docx
import tiktoken
def extract_text_from_pdf(file) -> tuple[str, int]:
    """Read a PDF and return (extracted_text, page_count).

    Pages that yield no text (e.g. scanned images) are skipped.
    """
    reader = PyPDF2.PdfReader(file)
    pages = reader.pages
    # Collect per-page text and join once, rather than string-concatenating.
    pieces = []
    for page in pages:
        content = page.extract_text()
        if content:
            pieces.append(content + "\n")
    return "".join(pieces).strip(), len(pages)
def extract_text_from_docx(file) -> tuple[str, int]:
    """Read a DOCX and return (extracted_text, estimated_pages).

    DOCX has no fixed pagination, so pages are estimated at roughly
    250 words per page (minimum 1).
    """
    document = docx.Document(file)
    # Keep only paragraphs with visible content.
    paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    body = "\n".join(paragraphs)
    n_words = len(body.split())
    return body.strip(), max(1, round(n_words / 250))
def extract_text_from_txt(file) -> tuple[str, int]:
    """Decode a plain-text upload and return (text, estimated_pages).

    Bytes are decoded as UTF-8 with undecodable sequences dropped;
    pages are estimated at ~250 words per page (minimum 1).
    """
    decoded = file.read().decode("utf-8", errors="ignore")
    n_words = len(decoded.split())
    pages = max(1, round(n_words / 250))
    return decoded.strip(), pages
def extract_text(file, filename: str) -> tuple[str, int]:
    """Dispatch to the extractor matching the filename's extension.

    Supports .pdf, .docx and .txt (case-insensitive); raises
    ValueError for anything else.
    """
    ext = os.path.splitext(filename)[1].lower()
    # Guard-clause dispatch: first matching extension wins.
    if ext == ".pdf":
        return extract_text_from_pdf(file)
    if ext == ".docx":
        return extract_text_from_docx(file)
    if ext == ".txt":
        return extract_text_from_txt(file)
    raise ValueError(f"Unsupported file type: {ext}. Supported: PDF, DOCX, TXT")
def count_tokens(text: str) -> int:
    """Count tokens using tiktoken's cl100k_base encoding.

    If tiktoken is unavailable or fails for any reason, fall back to
    a rough approximation of one token per 4 characters.
    """
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception:
        # Best-effort estimate: ~4 characters per token.
        return len(text) // 4
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by word count.

    Args:
        text: Source text; split on whitespace.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy 0 <= overlap < chunk_size so the window advances.

    Returns:
        A list of non-empty chunk strings (empty list for empty text).

    Raises:
        ValueError: If chunk_size <= 0, or overlap is negative or not
            smaller than chunk_size. (The previous implementation looped
            forever when overlap >= chunk_size because the window start
            never advanced.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks
def get_document_stats(text: str, page_count: int, filename: str) -> dict:
    """Assemble a summary-statistics dictionary for a document.

    Reading time assumes ~200 words per minute; sentence count is
    approximated by counting terminal punctuation marks.
    """
    n_words = len(text.split())
    # Approximate sentences via '.', '!' and '?' occurrences.
    n_sentences = sum(text.count(mark) for mark in ".!?")
    return {
        "filename": filename,
        "pages": page_count,
        "words": n_words,
        "characters": len(text),
        "tokens": count_tokens(text),
        "sentences": n_sentences,
        "avg_words_per_page": round(n_words / max(page_count, 1)),
        "estimated_read_time_min": max(1, round(n_words / 200)),
    }
|