Spaces:
Running
Running
| import os | |
| import PyPDF2 | |
| import docx | |
| import tiktoken | |
def extract_text_from_pdf(file) -> tuple[str, int]:
    """Extract text from a PDF and return ``(text, page_count)``.

    Args:
        file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A tuple of:
          * the text of every page that yielded any text, joined with
            newlines and stripped of leading/trailing whitespace;
          * the number of pages in the document. Pages that yield no text
            are still counted.
    """
    reader = PyPDF2.PdfReader(file)
    pages = reader.pages
    # Collect the per-page text and join once, rather than growing a string
    # with ``+=`` on every page. Pages whose extraction is empty/None are
    # skipped, exactly as before.
    page_texts = [
        extracted for page in pages if (extracted := page.extract_text())
    ]
    return "\n".join(page_texts).strip(), len(pages)
def extract_text_from_docx(file) -> tuple[str, int]:
    """Read a DOCX document and return ``(text, estimated_pages)``.

    Only paragraphs whose text is not blank are kept; they are joined with
    newlines. DOCX files carry no fixed page count here, so the page number
    is an estimate derived from the word count.

    Args:
        file: The DOCX source, passed unchanged to ``docx.Document``.

    Returns:
        The stripped document text and the estimated page count (>= 1).
    """
    document = docx.Document(file)
    kept_paragraphs = [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    ]
    text = "\n".join(kept_paragraphs)
    # Page heuristic: roughly 250 words per page, never fewer than one page.
    total_words = len(text.split())
    pages = max(1, round(total_words / 250))
    return text.strip(), pages
def extract_text_from_txt(file) -> tuple[str, int]:
    """Extract text from a TXT file and return ``(text, estimated_pages)``.

    Args:
        file: A file-like object with a ``read()`` method. Binary handles
            (returning ``bytes``) are decoded as UTF-8; text handles
            (returning ``str``) are accepted as-is.

    Returns:
        The text stripped of leading/trailing whitespace, and a page
        estimate of about 250 words per page (never less than 1).
    """
    raw = file.read()
    if isinstance(raw, str):
        # Text-mode handle: already decoded, only drop a leading BOM char.
        text = raw.removeprefix("\ufeff")
    else:
        # "utf-8-sig" decodes plain UTF-8 identically but also consumes a
        # leading byte-order mark, which would otherwise survive as U+FEFF
        # (``str.strip`` does not treat it as whitespace). Undecodable
        # bytes are still dropped on purpose (best-effort extraction).
        text = raw.decode("utf-8-sig", errors="ignore")
    word_count = len(text.split())
    estimated_pages = max(1, round(word_count / 250))
    return text.strip(), estimated_pages
def extract_text(file, filename: str) -> tuple[str, int]:
    """Dispatch to the extractor matching the extension of ``filename``.

    The extension is compared case-insensitively.

    Args:
        file: The uploaded file object, handed to the chosen extractor.
        filename: Name of the upload; only its extension is inspected.

    Returns:
        The ``(text, page_count)`` tuple produced by the chosen extractor.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
    """
    suffix = os.path.splitext(filename)[1].lower()

    # Reject unknown types up front so the dispatch below is exhaustive.
    if suffix not in (".pdf", ".docx", ".txt"):
        raise ValueError(f"Unsupported file type: {suffix}. Supported: PDF, DOCX, TXT")

    if suffix == ".pdf":
        return extract_text_from_pdf(file)
    if suffix == ".docx":
        return extract_text_from_docx(file)
    return extract_text_from_txt(file)
def count_tokens(text: str) -> int:
    """Count tokens using tiktoken (cl100k_base encoding).

    Args:
        text: The text to measure.

    Returns:
        The number of ``cl100k_base`` tokens in ``text``. If tiktoken cannot
        be used for any reason, an approximation of one token per four
        characters is returned instead.
    """
    try:
        enc = tiktoken.get_encoding("cl100k_base")
        # Documents are arbitrary user text and may literally contain
        # special-token strings such as "<|endoftext|>". By default
        # ``encode`` raises on those, which would silently push us onto the
        # rough fallback below; disabling the check encodes them as
        # ordinary text.
        return len(enc.encode(text, disallowed_special=()))
    except Exception:
        # Deliberate best-effort: any tokenizer failure falls back to an
        # approximation of 1 token per 4 characters.
        return len(text) // 4
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by word count.

    Words are the whitespace-separated tokens of ``text``. Each chunk starts
    ``chunk_size - overlap`` words after the previous one and holds up to
    ``chunk_size`` words, re-joined with single spaces.

    Note: a new chunk starts at every step position that is still inside
    the text, so the final chunk may consist solely of words already
    present at the end of the previous chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The list of chunks, in order. Empty for empty/whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
            non-positive step would otherwise never advance through the
            text (an endless loop), and a negative overlap would skip words.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    # Every start index is < len(words), so each slice holds at least one
    # word and no chunk can be empty.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def get_document_stats(text: str, page_count: int, filename: str) -> dict:
    """Build the summary statistics shown for an uploaded document.

    Args:
        text: The extracted document text.
        page_count: Number of pages (real or estimated) in the document.
        filename: Name of the uploaded file, echoed back in the result.

    Returns:
        A dict with the keys ``filename``, ``pages``, ``words``,
        ``characters``, ``tokens``, ``sentences``, ``avg_words_per_page``
        and ``estimated_read_time_min``.
    """
    n_words = len(text.split())
    n_chars = len(text)
    n_tokens = count_tokens(text)
    # Sentence heuristic: every '.', '!' and '?' character counts as one.
    n_sentences = sum(text.count(mark) for mark in ".!?")
    # Guard against a zero page count when averaging.
    pages_divisor = max(page_count, 1)
    # Reading time assumes 200 words per minute, reported as at least 1.
    read_minutes = max(1, round(n_words / 200))

    stats = {
        "filename": filename,
        "pages": page_count,
        "words": n_words,
        "characters": n_chars,
        "tokens": n_tokens,
        "sentences": n_sentences,
        "avg_words_per_page": round(n_words / pages_divisor),
        "estimated_read_time_min": read_minutes,
    }
    return stats