Spaces:

prasanthr0416
/

Tarka_rag_system

Running

App Files Files Community

Tarka_rag_system / document_processor.py

prasanthr0416

Upload 4 files

f39db8f verified 17 days ago

raw

history blame contribute delete

3.1 kB

	import os
	import PyPDF2
	import docx
	import tiktoken


	def extract_text_from_pdf(file) -> tuple[str, int]:
	    """Read a PDF and return its text together with its page count.

	    Args:
	        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

	    Returns:
	        A ``(text, page_count)`` tuple. ``text`` is the text of every page
	        that yielded any, each followed by a newline, with surrounding
	        whitespace stripped. ``page_count`` counts all pages, including
	        those that produced no text.
	    """
	    pdf = PyPDF2.PdfReader(file)
	    total_pages = len(pdf.pages)
	    page_texts = (page.extract_text() for page in pdf.pages)
	    # Pages whose extraction result is empty/falsy contribute nothing.
	    combined = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
	    return combined.strip(), total_pages


	def extract_text_from_docx(file) -> tuple[str, int]:
	    """Read a DOCX document and return its paragraph text and a page estimate.

	    Args:
	        file: Whatever ``docx.Document`` accepts as its source.

	    Returns:
	        A ``(text, estimated_pages)`` tuple. ``text`` is every paragraph
	        that is not blank, joined by newlines and stripped. Only
	        ``doc.paragraphs`` is read. ``estimated_pages`` is derived from
	        the word count, not read from the document.
	    """
	    document = docx.Document(file)
	    kept = [p.text for p in document.paragraphs if p.text.strip()]
	    body = "\n".join(kept)
	    # No page count is read from the file, so approximate one at roughly
	    # 250 words per page, never reporting fewer than one page.
	    pages = max(1, round(len(body.split()) / 250))
	    return body.strip(), pages


	def extract_text_from_txt(file) -> tuple[str, int]:
	    """Read a plain-text upload and return its text and a page estimate.

	    Args:
	        file: A readable stream. ``file.read()`` may return bytes (decoded
	            here as UTF-8) or an already-decoded ``str``.

	    Returns:
	        A ``(text, estimated_pages)`` tuple. ``text`` has surrounding
	        whitespace stripped. ``estimated_pages`` assumes roughly 250 words
	        per page and is never less than 1.
	    """
	    raw = file.read()
	    if isinstance(raw, str):
	        # Text-mode streams hand back str, which has no .decode().
	        text = raw
	    else:
	        # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte
	        # order mark; plain "utf-8" would leave U+FEFF at the start of the
	        # text, and str.strip() does not remove it. Undecodable bytes are
	        # still skipped rather than raising.
	        text = raw.decode("utf-8-sig", errors="ignore")
	    word_count = len(text.split())
	    estimated_pages = max(1, round(word_count / 250))
	    return text.strip(), estimated_pages


	def extract_text(file, filename: str) -> tuple[str, int]:
	    """Dispatch an uploaded file to the extractor matching its extension.

	    Args:
	        file: The uploaded file object, passed through to the extractor.
	        filename: Name of the upload; only its extension is inspected,
	            case-insensitively.

	    Returns:
	        The ``(text, page_count)`` tuple produced by the chosen extractor.

	    Raises:
	        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
	    """
	    _, suffix = os.path.splitext(filename)
	    suffix = suffix.lower()
	    if suffix == ".pdf":
	        return extract_text_from_pdf(file)
	    if suffix == ".docx":
	        return extract_text_from_docx(file)
	    if suffix == ".txt":
	        return extract_text_from_txt(file)
	    raise ValueError(f"Unsupported file type: {suffix}. Supported: PDF, DOCX, TXT")


	def count_tokens(text: str) -> int:
	    """Count the tokens in ``text`` using tiktoken's ``cl100k_base`` encoding.

	    Args:
	        text: The text to measure.

	    Returns:
	        The number of tokens produced by the encoder, or, if tokenization
	        fails for any reason, an approximation of one token per four
	        characters.
	    """
	    try:
	        enc = tiktoken.get_encoding("cl100k_base")
	        # Uploaded documents are arbitrary text. With tiktoken's default
	        # arguments, encode() raises when the input contains a special-token
	        # literal such as "<|endoftext|>", which would silently push this
	        # function onto the rough fallback below. disallowed_special=()
	        # makes such strings be encoded as ordinary text instead.
	        return len(enc.encode(text, disallowed_special=()))
	    except Exception:
	        # Deliberate best-effort fallback: approximate 1 token per 4
	        # characters whenever the encoder cannot be loaded or used.
	        return len(text) // 4


	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
	    """Split text into overlapping chunks measured in whitespace-separated words.

	    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
	    chunk reaches the end of the text, so no trailing chunk is emitted that
	    would consist solely of words already present in the previous chunk.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words repeated at the start of the next chunk;
	            must satisfy ``0 <= overlap < chunk_size``.

	    Returns:
	        The chunks in document order, each a single-space-joined string.
	        Empty or whitespace-only text yields an empty list.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
	            outside ``[0, chunk_size)``.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    if not 0 <= overlap < chunk_size:
	        # overlap >= chunk_size gives a non-positive step, so the window
	        # would never advance; a negative overlap would skip words.
	        raise ValueError(
	            f"overlap must be in [0, chunk_size), got overlap={overlap} "
	            f"with chunk_size={chunk_size}"
	        )
	    words = text.split()
	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        end = start + chunk_size
	        chunks.append(" ".join(words[start:end]))
	        if end >= len(words):
	            # This chunk already covers the tail of the text; a further
	            # window would only repeat the overlap words.
	            break
	    return chunks


	def get_document_stats(text: str, page_count: int, filename: str) -> dict:
	    """Build the summary statistics dictionary for an uploaded document.

	    Args:
	        text: The extracted document text.
	        page_count: Page count reported for the document.
	        filename: Name of the uploaded file, echoed back in the result.

	    Returns:
	        A dict with the keys ``filename``, ``pages``, ``words``,
	        ``characters``, ``tokens``, ``sentences``, ``avg_words_per_page``
	        and ``estimated_read_time_min``.
	    """
	    n_words = len(text.split())
	    n_chars = len(text)
	    n_tokens = count_tokens(text)
	    # Sentences are approximated by counting every '.', '!' and '?' character.
	    n_sentences = sum(text.count(mark) for mark in ".!?")
	    # Treat a page count below 1 as 1 so the division is always defined.
	    words_per_page = round(n_words / max(page_count, 1))
	    # Reading time assumes 200 words per minute and is at least one minute.
	    read_minutes = max(1, round(n_words / 200))
	    stats = {
	        "filename": filename,
	        "pages": page_count,
	        "words": n_words,
	        "characters": n_chars,
	        "tokens": n_tokens,
	        "sentences": n_sentences,
	        "avg_words_per_page": words_per_page,
	        "estimated_read_time_min": read_minutes,
	    }
	    return stats