File size: 4,455 Bytes
822c114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import re
import hashlib
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

try:
    from pypdf import PdfReader
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False


@dataclass
class ParsedDocument:
    """Result of parsing one document: extracted text plus detected structure."""
    title: str  # best-guess title (first plausible line, else derived from filename)
    full_text: str  # all page texts joined with blank lines
    sections: list[dict]  # dicts with "title", "content", "start", "end" keys (see detect_sections)
    page_count: int  # number of pages the text was extracted from


def extract_title(text: str, filename: str) -> str:
    """Guess a document title from its opening lines, else derive one from *filename*.

    Scans the first ten lines for a plausible heading: strictly between 20 and
    200 characters and not starting like a URL/DOI. Falls back to the filename
    stem with underscores/hyphens spaced out and title-cased.
    """
    for raw in text.strip().split('\n')[:10]:
        candidate = raw.strip()
        plausible_length = 20 < len(candidate) < 200
        looks_like_link = candidate.startswith(('http', 'www', 'doi'))
        if plausible_length and not looks_like_link:
            return candidate
    stem = Path(filename).stem
    return stem.replace('_', ' ').replace('-', ' ').title()


def detect_sections(text: str) -> list[dict]:
    """Split *text* into sections keyed on common academic headings.

    A heading is a whole line matching one of the known section names
    (optionally numbered), case-insensitively. Returns dicts with "title",
    "content", "start", "end"; when no heading is found (or every section
    body is empty) a single catch-all "Content" section covers the text.
    """
    heading_re = re.compile(
        r'^(?:(\d+\.?\s*)?)(Abstract|Introduction|Background|Related Work|'
        r'Methodology|Methods|Method|Approach|Model|Architecture|'
        r'Experiments?|Results?|Discussion|Conclusion|Conclusions|'
        r'References|Acknowledgments?|Appendix)\s*$',
        re.IGNORECASE | re.MULTILINE
    )

    hits = list(heading_re.finditer(text))
    fallback = [{"title": "Content", "content": text, "start": 0, "end": len(text)}]
    if not hits:
        return fallback

    # Each section runs from the end of its heading to the start of the next
    # heading (or end of text for the last one).
    ends = [nxt.start() for nxt in hits[1:]] + [len(text)]
    found = []
    for hit, end in zip(hits, ends):
        begin = hit.end()
        body = text[begin:end].strip()
        if not body:
            continue  # skip headings with no text under them
        found.append({
            "title": hit.group(2).strip(),
            "content": body,
            "start": begin,
            "end": end,
        })

    return found or fallback


def ingest_pdf(file_path: Path) -> Optional[ParsedDocument]:
    """Read a PDF from disk and return a ParsedDocument, or None on any failure.

    Returns None when pypdf is not installed, when parsing raises, or when
    the extracted text is too short (< 100 chars) to be worth indexing.
    """
    if not HAS_PYPDF:
        return None

    try:
        page_texts = [
            page.extract_text() or ""  # extract_text may return None for image-only pages
            for page in PdfReader(str(file_path)).pages
        ]
        combined = "\n\n".join(page_texts)

        # Reject near-empty extractions (e.g. scanned PDFs with no text layer).
        if len(combined.strip()) < 100:
            return None

        return ParsedDocument(
            title=extract_title(combined, file_path.name),
            full_text=combined,
            sections=detect_sections(combined),
            page_count=len(page_texts),
        )
    except Exception:
        # Best-effort ingestion: any parsing error simply yields no document.
        return None


def chunk_document(doc: ParsedDocument, paper_id: str, chunk_size: int = 2000) -> list[dict]:
    """Split a parsed document into retrieval chunks of roughly *chunk_size* chars.

    Sections that fit within chunk_size become one chunk; longer sections are
    packed greedily on paragraph ('\\n\\n') boundaries. NOTE: a single paragraph
    longer than chunk_size is still emitted as one oversized chunk — it is
    never split mid-paragraph.

    Args:
        doc: the parsed document to chunk.
        paper_id: stable identifier mixed into each chunk id.
        chunk_size: soft maximum chunk length in characters.

    Returns:
        A list of DocumentChunk objects (declared in vector_store).
    """
    from vector_store import DocumentChunk

    def _make_chunk(section_title: str, stored: str, hashed: Optional[str] = None):
        # Build one DocumentChunk. The id hashes paper, section, and a 100-char
        # text prefix so re-ingesting identical text yields stable ids. *hashed*
        # lets the caller seed the id from the un-stripped paragraph buffer,
        # preserving the ids produced before this refactor.
        basis = stored if hashed is None else hashed
        chunk_id = hashlib.md5(
            f"{paper_id}:{section_title}:{basis[:100]}".encode()
        ).hexdigest()
        return DocumentChunk(
            chunk_id=chunk_id,
            paper_id=paper_id,
            paper_name=doc.title,
            content=stored,
            section_title=section_title,
        )

    chunks = []
    for section in doc.sections:
        content = section["content"]
        section_title = section["title"]

        if len(content) <= chunk_size:
            chunks.append(_make_chunk(section_title, content))
            continue

        # Greedily pack paragraphs until adding the next one would overflow.
        current = ""
        for para in content.split('\n\n'):
            if len(current) + len(para) <= chunk_size:
                current += para + "\n\n"
            else:
                if current.strip():
                    chunks.append(_make_chunk(section_title, current.strip(), hashed=current))
                current = para + "\n\n"

        # Flush whatever remains in the buffer.
        if current.strip():
            chunks.append(_make_chunk(section_title, current.strip(), hashed=current))

    return chunks