import os
import PyPDF2
import docx
import tiktoken
def extract_text_from_pdf(file) -> tuple[str, int]:
    """Read a PDF and return (extracted_text, page_count).

    Pages that yield no text (e.g. scanned images) are skipped.
    """
    reader = PyPDF2.PdfReader(file)
    pages = reader.pages
    # Collect per-page text and join once, rather than string-concatenating.
    pieces = []
    for page in pages:
        content = page.extract_text()
        if content:
            pieces.append(content + "\n")
    return "".join(pieces).strip(), len(pages)
def extract_text_from_docx(file) -> tuple[str, int]:
    """Read a DOCX and return (extracted_text, estimated_pages).

    DOCX has no fixed pagination, so pages are estimated at roughly
    250 words per page (minimum 1).
    """
    document = docx.Document(file)
    # Keep only paragraphs with visible content.
    paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
    body = "\n".join(paragraphs)
    n_words = len(body.split())
    return body.strip(), max(1, round(n_words / 250))
def extract_text_from_txt(file) -> tuple[str, int]:
    """Decode a plain-text upload and return (text, estimated_pages).

    Bytes are decoded as UTF-8 with undecodable sequences dropped;
    pages are estimated at ~250 words per page (minimum 1).
    """
    decoded = file.read().decode("utf-8", errors="ignore")
    n_words = len(decoded.split())
    pages = max(1, round(n_words / 250))
    return decoded.strip(), pages
def extract_text(file, filename: str) -> tuple[str, int]:
    """Dispatch to the extractor matching the filename's extension.

    Supports .pdf, .docx and .txt (case-insensitive); raises
    ValueError for anything else.
    """
    ext = os.path.splitext(filename)[1].lower()
    # Guard-clause dispatch: first matching extension wins.
    if ext == ".pdf":
        return extract_text_from_pdf(file)
    if ext == ".docx":
        return extract_text_from_docx(file)
    if ext == ".txt":
        return extract_text_from_txt(file)
    raise ValueError(f"Unsupported file type: {ext}. Supported: PDF, DOCX, TXT")
def count_tokens(text: str) -> int:
    """Count tokens using tiktoken's cl100k_base encoding.

    If tiktoken is unavailable or fails for any reason, fall back to
    a rough approximation of one token per 4 characters.
    """
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception:
        # Best-effort estimate: ~4 characters per token.
        return len(text) // 4
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by word count.

    Args:
        text: Source text; split on whitespace.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy 0 <= overlap < chunk_size so the window advances.

    Returns:
        A list of non-empty chunk strings (empty list for empty text).

    Raises:
        ValueError: If chunk_size <= 0, or overlap is negative or not
            smaller than chunk_size. (The previous implementation looped
            forever when overlap >= chunk_size because the window start
            never advanced.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks
def get_document_stats(text: str, page_count: int, filename: str) -> dict:
    """Assemble a summary-statistics dictionary for a document.

    Reading time assumes ~200 words per minute; sentence count is
    approximated by counting terminal punctuation marks.
    """
    n_words = len(text.split())
    # Approximate sentences via '.', '!' and '?' occurrences.
    n_sentences = sum(text.count(mark) for mark in ".!?")
    return {
        "filename": filename,
        "pages": page_count,
        "words": n_words,
        "characters": len(text),
        "tokens": count_tokens(text),
        "sentences": n_sentences,
        "avg_words_per_page": round(n_words / max(page_count, 1)),
        "estimated_read_time_min": max(1, round(n_words / 200)),
    }
|