# src/inference/corrector.py
"""
End-to-end inference pipeline.
Accepts raw dyslectic text (and optionally a master copy),
returns corrected academic text with metadata.
"""
from ..preprocessing.pipeline import PreprocessingPipeline
from ..style.fingerprinter import StyleFingerprinter
from ..vocabulary.lexical_substitution import LexicalElevator, RegisterFilter
from ..model.base_model import load_model_and_tokenizer
from ..model.style_conditioner import StyleConditioner, prepend_style_prefix
from ..model.generation_utils import generate_correction
from .postprocessor import PostProcessor
from ..evaluation.style_metrics import StyleEvaluator
from ..vocabulary.awl_loader import AWLLoader
import torch
from typing import Optional
from dataclasses import dataclass
from loguru import logger
import yaml
# Instruction prefix prepended to every input chunk before generation.
# Its token length is measured and subtracted from the 128-token input
# budget in AcademicCorrector.correct(), so editing this text changes the
# effective chunk size.  The "Text to correct: " tail marks where the
# user text begins for the model.
TASK_PREFIX = (
    "Correct the following text for grammar, spelling, and clarity. "
    "Maintain the author's original tone and writing style. "
    "Elevate vocabulary to academic register. "
    "Do NOT change the meaning or add new information. "
    "Preserve named entities exactly. "
    "Text to correct: "
)
@dataclass
class CorrectionResult:
    """Result bundle returned by ``AcademicCorrector.correct()``."""

    original: str            # raw input text, exactly as supplied by the caller
    corrected: str           # final corrected / vocabulary-elevated output text
    preprocessed: str        # spell-corrected text before model generation
    style_similarity: float  # evaluator score between raw input and output
    awl_coverage: float      # Academic Word List coverage score of the output
    readability: dict        # readability stats produced by the preprocessor
    changes_summary: str     # human-readable summary of change categories applied
class AcademicCorrector:
    """Full inference pipeline: preprocess → fingerprint → generate → elevate → filter.

    Wires together every stage of the correction system:

    1. ``PreprocessingPipeline`` — spell correction, entity protection, parsing.
    2. ``StyleFingerprinter`` — stylometric vector extraction (and its spaCy
       pipeline, reused for sentence splitting).
    3. A seq2seq model — base model, full fine-tuned checkpoint, or base
       model + PEFT adapter, depending on configuration.
    4. ``LexicalElevator`` / ``RegisterFilter`` — academic vocabulary pass.
    5. ``PostProcessor`` + ``StyleEvaluator`` — cleanup and quality metrics.
    """

    def __init__(self, config: dict):
        """Build the pipeline from a parsed configuration dict.

        Args:
            config: nested dict with optional sections ``model``,
                ``generation``, ``vocabulary``, ``style_conditioner`` and
                ``fingerprinter``; missing sections fall back to defaults.
        """
        logger.info("Initialising AcademicCorrector...")
        model_cfg = config.get("model", {})
        gen_cfg = config.get("generation", {})
        vocab_cfg = config.get("vocabulary", {})
        style_cfg = config.get("style_conditioner", {})

        # 1. Load model and tokenizer.  Three cases, most specific first:
        #    PEFT adapter checkpoint, full merged checkpoint, plain base model.
        #    Each fallible load degrades best-effort to the base model.
        model_key = model_cfg.get("key", "flan-t5-small")
        checkpoint = model_cfg.get("checkpoint_path", None)
        use_lora = model_cfg.get("use_lora", False)
        if checkpoint and use_lora:
            # PEFT adapter checkpoint: load base model + apply adapter.
            try:
                from peft import PeftModel

                logger.info(f"Loading base model '{model_key}' + PEFT adapter from '{checkpoint}'")
                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
                    model_key, quantize=False, use_lora=False
                )
                self.model = PeftModel.from_pretrained(self.model, checkpoint)
                logger.info(f"PEFT adapter loaded from {checkpoint}")
            except Exception as e:
                logger.warning(f"PEFT loading failed ({e}), loading base model only")
                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
                    model_key, quantize=False, use_lora=False
                )
        elif checkpoint:
            # Full model checkpoint (merged weights).
            try:
                from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

                self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
                self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
                self.is_seq2seq = True
                logger.info(f"Loaded full model from checkpoint: {checkpoint}")
            except Exception:
                logger.warning(f"Checkpoint not found, loading base model: {model_key}")
                self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
                    model_key, quantize=False, use_lora=False
                )
        else:
            self.model, self.tokenizer, self.is_seq2seq = load_model_and_tokenizer(
                model_key, quantize=False, use_lora=False
            )
        self.model.eval()
        self.generation_config = gen_cfg

        # 2. Preprocessor
        self.preprocessor = PreprocessingPipeline()

        # 3. Style fingerprinter
        fp_cfg = config.get("fingerprinter", {})
        self.fingerprinter = StyleFingerprinter(
            spacy_model=fp_cfg.get("spacy_model", "en_core_web_sm"),
            awl_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"),
        )

        # 4. Style conditioner — auto-detect hidden dim from the loaded model
        #    (T5-family configs expose ``d_model``; BERT-style expose ``hidden_size``).
        if hasattr(self.model.config, "d_model"):
            auto_hidden_dim = self.model.config.d_model
        elif hasattr(self.model.config, "hidden_size"):
            auto_hidden_dim = self.model.config.hidden_size
        else:
            auto_hidden_dim = 512  # Safe default for T5-Small
        logger.info(f"Auto-detected model hidden dim: {auto_hidden_dim}")
        self.conditioner = StyleConditioner(
            style_dim=style_cfg.get("style_dim", 512),
            model_hidden_dim=style_cfg.get("model_hidden_dim", auto_hidden_dim),
            n_prefix_tokens=style_cfg.get("n_prefix_tokens", 10),
        )
        self.conditioner.eval()

        # 5. Vocabulary elevator — optional; its heavy MLM/semantic models may
        #    be unavailable, in which case elevation is disabled, not fatal.
        try:
            self.elevator = LexicalElevator(
                awl_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"),
                spacy_model="en_core_web_sm",
                mlm_model=vocab_cfg.get("mlm_model", "bert-large-uncased"),
                sem_model=vocab_cfg.get("sem_model", "all-mpnet-base-v2"),
            )
        except Exception as e:
            logger.warning(f"Lexical elevator init failed: {e}, elevation disabled")
            self.elevator = None

        # 6. Register filter
        self.register_filter = RegisterFilter()
        # 7. Post-processor
        self.postprocessor = PostProcessor()
        # 8. Evaluator
        awl = AWLLoader(primary_path=vocab_cfg.get("awl_path", "data/awl/coxhead_awl.txt"))
        self.evaluator = StyleEvaluator(self.fingerprinter, awl)
        logger.info("AcademicCorrector initialised successfully")

    @staticmethod
    def _chunk_sentences(sentences, budget, count_tokens):
        """Greedily group ``sentences`` into chunks of at most ``budget`` tokens.

        Args:
            sentences: sentence strings in document order.
            budget: maximum token count per chunk.
            count_tokens: callable mapping a sentence string to its token count.

        Returns:
            List of chunk strings (sentences joined by single spaces).  A lone
            sentence longer than ``budget`` still becomes its own chunk — the
            tokenizer truncates it downstream.
        """
        chunks = []
        current_chunk = []
        current_tokens = 0
        for sent in sentences:
            sent_tokens = count_tokens(sent)
            if current_tokens + sent_tokens > budget and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sent]
                current_tokens = sent_tokens
            else:
                current_chunk.append(sent)
                current_tokens += sent_tokens
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks

    def correct(
        self,
        raw_text: str,
        master_copy: Optional[str] = None,
        style_alpha: float = 0.6,
    ) -> CorrectionResult:
        """
        Full correction pipeline:
        1. Pre-process (spell correct + parse)
        2. Style fingerprint
        3. Generate with style conditioning
        4. Academic vocabulary elevation
        5. Register filter
        6. Compute quality metrics

        Args:
            raw_text: the raw (possibly dyslectic) input text.
            master_copy: optional reference text whose style is blended
                with the user's.
            style_alpha: blend weight passed to the fingerprinter when a
                master copy is provided.

        Returns:
            A populated ``CorrectionResult``.
        """
        # Step 1: Pre-process
        logger.info("Step 1: Preprocessing...")
        doc = self.preprocessor.process(raw_text)

        # Step 2: Style fingerprint
        logger.info("Step 2: Extracting style fingerprint...")
        user_style = self.fingerprinter.extract_vector(doc.corrected_text)
        if master_copy:
            master_style = self.fingerprinter.extract_vector(master_copy)
            target_style = self.fingerprinter.blend_vectors(user_style, master_style, alpha=style_alpha)
        else:
            target_style = user_style
        # NOTE(review): target_style (and self.conditioner from __init__) is
        # computed but never passed into generation below — confirm whether
        # style conditioning is intentionally disabled in this pipeline.

        # Step 3: Generate correction (sentence-chunked).
        # The model was trained on max_input_length=128 tokens, so split the
        # text into sentence groups that fit that window, correct each chunk
        # independently, then reassemble.
        logger.info("Step 3: Generating correction (chunked)...")
        MAX_INPUT_TOKENS = 128
        # Measure how many tokens the task prefix uses.
        prefix_tokens = len(self.tokenizer.encode(TASK_PREFIX, add_special_tokens=False))
        budget = MAX_INPUT_TOKENS - prefix_tokens - 2  # 2 for special tokens

        # Split into sentences using spaCy (already loaded for fingerprinting).
        sent_doc = self.fingerprinter.nlp(doc.corrected_text)
        sentences = [sent.text.strip() for sent in sent_doc.sents if sent.text.strip()]
        chunks = self._chunk_sentences(
            sentences,
            budget,
            lambda s: len(self.tokenizer.encode(s, add_special_tokens=False)),
        )
        logger.info(f" Split into {len(chunks)} chunks from {len(sentences)} sentences")

        corrected_chunks = []
        device = next(self.model.parameters()).device
        for i, chunk in enumerate(chunks):
            chunk_input = TASK_PREFIX + chunk
            inputs = self.tokenizer(
                chunk_input,
                max_length=MAX_INPUT_TOKENS,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            chunk_output = generate_correction(
                self.model,
                self.tokenizer,
                input_ids,
                attention_mask,
                self.generation_config,
            )
            corrected_chunks.append(chunk_output)
            logger.debug(f" Chunk {i+1}/{len(chunks)}: {len(chunk.split())}{len(chunk_output.split())} words")
        generated = " ".join(corrected_chunks)

        # Step 4: Post-process
        logger.info("Step 4: Post-processing...")
        generated = self.postprocessor.clean(generated)
        generated = self.postprocessor.restore_entities(
            generated,
            [e.text for e in doc.entities],
            doc.protected_spans,
        )

        # Step 5: Vocabulary elevation (best-effort; skipped when disabled)
        logger.info("Step 5: Vocabulary elevation...")
        if self.elevator:
            try:
                generated = self.elevator.elevate(generated, doc.protected_spans)
            except Exception as e:
                logger.warning(f"Vocabulary elevation failed: {e}")

        # Step 6: Register filter
        logger.info("Step 6: Register filtering...")
        generated = self.register_filter.apply(generated)

        # Final formatting
        generated = self.postprocessor.format_output(generated)

        # Step 7: Compute quality metrics
        logger.info("Step 7: Computing metrics...")
        style_sim = self.evaluator.style_similarity(raw_text, generated)
        awl_cov = self.evaluator.awl_coverage(generated)

        # Build a human-readable summary of what changed.
        changes = []
        if doc.original_text != doc.corrected_text:
            changes.append("Spelling/grammar corrections applied")
        if generated != doc.corrected_text:
            changes.append("Text restructured and elevated")
        changes_summary = "; ".join(changes) if changes else "No changes needed"

        return CorrectionResult(
            original=raw_text,
            corrected=generated,
            preprocessed=doc.corrected_text,
            style_similarity=style_sim,
            awl_coverage=awl_cov,
            readability=doc.readability,
            changes_summary=changes_summary,
        )