"""
Data processing utilities for loading and preparing documents
"""

import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Runs of sentence-ending punctuation, used for the heuristic sentence count.
_SENTENCE_BOUNDARY = re.compile(r'[.!?]+')


def _full_text(doc: Dict[str, Any]) -> str:
    """Return the document's 'full_text', treating a missing/None value as ''."""
    return doc.get('full_text') or ''


def _count_sentences(text: str) -> int:
    """Count the non-empty segments between runs of '.', '!' or '?'.

    This is a heuristic (abbreviations and decimals are counted as
    boundaries), but unlike a bare ``text.split('.')`` it reports 0 for
    empty text and does not count the empty tail after a final period.
    """
    return sum(1 for part in _SENTENCE_BOUNDARY.split(text) if part.strip())


def _cell_text(value: Any) -> str:
    """Convert a CSV cell to text, mapping None and NaN (empty cell) to ''."""
    if value is None:
        return ''
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return ''
    return str(value)


class DocumentProcessor:
    """Process and prepare documents for summarization."""

    def __init__(self):
        """Initialize document processor."""
        self.documents = []

    def load_documents(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Load documents from JSON or JSONL file.

        A ``.json`` file may hold either a list of documents or an object
        with a ``'documents'`` key. A ``.jsonl`` file holds one JSON value
        per non-blank line. On success the result is also stored in
        ``self.documents``.

        Args:
            file_path: Path to document file (str or os.PathLike)

        Returns:
            List of document dictionaries. Empty if the file type or JSON
            layout is not recognised, or if reading/parsing fails (the
            error is logged; loading is all-or-nothing).
        """
        documents = []
        path_text = file_path

        try:
            # Accept pathlib.Path as well as plain strings.
            path_text = os.fspath(file_path)

            if path_text.endswith('.json'):
                with open(path_text, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if isinstance(data, list):
                    documents = data
                elif isinstance(data, dict) and 'documents' in data:
                    documents = data['documents']
                else:
                    logger.warning(
                        "No document list found in %s "
                        "(expected a list or a 'documents' key)",
                        path_text,
                    )
            elif path_text.endswith('.jsonl'):
                with open(path_text, 'r', encoding='utf-8') as f:
                    documents = [json.loads(line) for line in f if line.strip()]
            else:
                logger.warning(
                    "Unsupported document file type (expected .json or .jsonl): %s",
                    path_text,
                )
        except Exception as e:
            # Best-effort loader: report the failure and return nothing
            # rather than propagating I/O or parse errors to the caller.
            logger.error("Error loading documents: %s", e)
            return []

        logger.info("Loaded %d documents from %s", len(documents), path_text)
        self.documents = documents
        return documents

    def save_documents(self, documents: List[Dict], output_path: str) -> bool:
        """
        Save documents to JSON file.

        Missing parent directories of ``output_path`` are created. The
        file is written as UTF-8, indented, without ASCII escaping.

        Args:
            documents: List of documents
            output_path: Path to save documents

        Returns:
            Success status (False if any error occurred; it is logged)
        """
        try:
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(documents, f, indent=2, ensure_ascii=False)

            logger.info("Saved %d documents to %s", len(documents), output_path)
            return True
        except Exception as e:
            logger.error("Error saving documents: %s", e)
            return False

    def process_batch(self, documents: List[Dict]) -> List[Dict]:
        """
        Process a batch of documents.

        Each output document carries the fields 'id', 'title', 'abstract',
        'full_text' and 'sections' (with empty defaults when absent) plus
        two derived fields: 'word_count' (whitespace-separated tokens of
        the full text) and 'sentence_count' (heuristic, see
        ``_count_sentences``).

        Args:
            documents: List of documents to process

        Returns:
            List of processed documents
        """
        processed = []

        for doc in documents:
            text = _full_text(doc)  # a None full_text is treated as ''
            processed.append({
                'id': doc.get('id', ''),
                'title': doc.get('title', ''),
                'abstract': doc.get('abstract', ''),
                'full_text': text,
                'sections': doc.get('sections', {}),
                'word_count': len(text.split()),
                'sentence_count': _count_sentences(text),
            })

        return processed

    def get_statistics(self, documents: Optional[List[Dict]] = None) -> Dict[str, Any]:
        """
        Get statistics about documents.

        Args:
            documents: Documents to analyze (uses self.documents if None).
                An explicitly passed empty list is analyzed as-is and
                yields an empty result.

        Returns:
            Dictionary of statistics with keys 'total_documents',
            'total_words', 'average_length', 'min_length' and
            'max_length' (lengths in words), or {} when there are no
            documents.
        """
        # Only fall back to the stored documents when nothing was passed;
        # `documents or self.documents` would also override an empty list.
        docs = self.documents if documents is None else documents

        if not docs:
            return {}

        word_counts = [len(_full_text(doc).split()) for doc in docs]
        total_words = sum(word_counts)

        return {
            'total_documents': len(word_counts),
            'total_words': total_words,
            'average_length': total_words / len(word_counts),
            'min_length': min(word_counts),
            'max_length': max(word_counts),
        }


class ArxivLoader:
    """Load arXiv dataset."""

    @staticmethod
    def load_from_csv(csv_path: str) -> List[Dict]:
        """
        Load arXiv data from CSV file.

        Columns read: 'id', 'title', 'authors' (';'-separated),
        'abstract', 'categories' (whitespace-separated) and 'update_date'
        (returned as 'published_date'). Missing columns and empty cells
        produce '' (or [] for the list fields).

        Args:
            csv_path: Path to the CSV file

        Returns:
            List of document dictionaries
        """
        import pandas as pd

        # Read every column as text: type inference would turn an ID such
        # as "0704.0001" into the float 704.0001, and empty cells into NaN
        # floats that have no .split().
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

        documents = []
        for record in df.to_dict(orient='records'):
            authors = _cell_text(record.get('authors'))
            categories = _cell_text(record.get('categories'))
            documents.append({
                'id': _cell_text(record.get('id')),
                'title': _cell_text(record.get('title')),
                'authors': [name.strip() for name in authors.split(';') if name.strip()],
                'abstract': _cell_text(record.get('abstract')),
                'categories': categories.split(),
                'published_date': _cell_text(record.get('update_date')),
            })

        return documents


class PubmedLoader:
    """Load PubMed dataset."""

    @staticmethod
    def fetch_from_api(query: str, max_results: int = 10) -> List[Dict]:
        """
        Fetch PubMed papers via API.

        Args:
            query: Search term sent as the 'term' parameter
            max_results: Page size requested from the service

        Returns:
            List of document dictionaries built from the 'papers' entries
            of the JSON response; empty if the request or parsing fails
            (the error is logged).
        """
        import requests

        # NOTE(review): this is not the NCBI E-utilities host; confirm that
        # this endpoint is supported and that it returns a 'papers' list.
        base_url = "https://pubmed.ncbi.nlm.nih.gov/api/gateway/search"
        params = {
            'term': query,
            'pageSize': max_results,
            'format': 'json'
        }

        try:
            response = requests.get(base_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            return [
                {
                    'id': paper.get('pmid', ''),
                    'title': paper.get('title', ''),
                    'abstract': paper.get('abstract', ''),
                    'authors': paper.get('authors', []),
                    'published_date': paper.get('pubdate', ''),
                }
                for paper in data.get('papers', [])
            ]
        except Exception as e:
            # Best-effort fetch: network, HTTP and payload errors all
            # degrade to an empty result.
            logger.error("Error fetching from PubMed: %s", e)
            return []


def load_sample_data() -> List[Dict]:
    """
    Load sample documents for testing.

    Looks for 'sample_documents.json' in the directory above the one
    containing this module.

    Returns:
        The loaded documents, or [] if the sample file does not exist.
    """
    current_dir = Path(__file__).parent.parent
    sample_file = current_dir / 'sample_documents.json'

    if sample_file.exists():
        processor = DocumentProcessor()
        return processor.load_documents(str(sample_file))

    return []