Update README.md
Browse files
README.md
CHANGED
|
@@ -46,17 +46,404 @@ The end-to-end workflow—**Phase 1: compression + indexing, Phase 2: retrieval
|
|
| 46 |
|
| 47 |
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
-
**
|
|
|
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
**APA:**
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
## Glossary [optional]
|
| 62 |
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
|
| 49 |
+
# Comp4Cls — Full Usage Guide (vLLM + Qwen3-4B)
|
| 50 |
|
| 51 |
+
This guide shows how to run **all three stages** of Comp4Cls with vLLM:
|
| 52 |
+
1) **Entity Extraction** → 2) **Compression** → 3) **Classification**.
|
| 53 |
|
| 54 |
+
It uses your **exact prompt templates** for each stage and a minimal vLLM wrapper.
|
| 55 |
+
Replace the model name with your fine-tuned repo if needed.
|
| 56 |
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## 0) Install & Setup
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
pip install vllm "transformers>=4.44" accelerate einops huggingface-hub
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## 1) Minimal Inference Primitives
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
import os, re, json
|
| 71 |
+
from typing import Optional, List, Dict
|
| 72 |
+
|
| 73 |
+
from vllm import LLM, SamplingParams
|
| 74 |
+
from transformers import AutoTokenizer
|
| 75 |
+
|
| 76 |
+
# ----------------------
|
| 77 |
+
# Config
|
| 78 |
+
# ----------------------
|
| 79 |
+
MODEL_NAME = "comp4cls/comp4cls-4B"
|
| 80 |
+
|
| 81 |
+
# Generation params (Stage-3 uses stop at </answer>)
|
| 82 |
+
GEN_COMMON = SamplingParams(
|
| 83 |
+
temperature=0.2,
|
| 84 |
+
top_p=0.8,
|
| 85 |
+
repetition_penalty=1.1,
|
| 86 |
+
frequency_penalty=0.1,
|
| 87 |
+
presence_penalty=0.1,
|
| 88 |
+
max_tokens=2048,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Stage-3 sampling params: identical to GEN_COMMON plus a stop sequence at
# </answer> so generation terminates cleanly right after the answer tag.
# SamplingParams exposes no public copy/_asdict API, so build it explicitly
# from GEN_COMMON's fields instead of introspecting __dict__.
GEN_CLASSIFICATION = SamplingParams(
    temperature=GEN_COMMON.temperature,
    top_p=GEN_COMMON.top_p,
    repetition_penalty=GEN_COMMON.repetition_penalty,
    frequency_penalty=GEN_COMMON.frequency_penalty,
    presence_penalty=GEN_COMMON.presence_penalty,
    max_tokens=GEN_COMMON.max_tokens,
    stop=["</answer>"],
)
|
| 101 |
+
|
| 102 |
+
# ----------------------
|
| 103 |
+
# Load tokenizer & model
|
| 104 |
+
# ----------------------
|
| 105 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 106 |
+
llm = LLM(
|
| 107 |
+
model=MODEL_NAME,
|
| 108 |
+
trust_remote_code=True,
|
| 109 |
+
tensor_parallel_size=1,
|
| 110 |
+
gpu_memory_utilization=0.95,
|
| 111 |
+
max_model_len=30000,
|
| 112 |
+
max_num_seqs=64,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
# ----------------------
|
| 116 |
+
# Helpers
|
| 117 |
+
# ----------------------
|
| 118 |
+
def apply_chat_template(prompt: str, enable_thinking: bool=False) -> str:
    """Format a raw prompt as a single user turn via the model's chat template.

    :param prompt: raw prompt text for one user message
    :param enable_thinking: pass-through flag for Qwen-style thinking mode
    :return: the fully formatted (untokenized) prompt string
    """
    chat = [{"role": "user", "content": prompt}]
    formatted = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )
    return formatted
|
| 127 |
+
|
| 128 |
+
def generate_text(prompt: str, params: SamplingParams) -> str:
    """Run one vLLM generation pass and return the raw completion text.

    The prompt is wrapped with the chat template (thinking disabled) before
    being handed to the engine; only the first candidate of the first request
    is returned.
    """
    templated = apply_chat_template(prompt, enable_thinking=False)
    results = llm.generate([templated], params)
    return results[0].outputs[0].text
|
| 134 |
+
|
| 135 |
+
def parse_json_object(text: str) -> dict:
    """Decode the outermost ``{...}`` span found in *text* as JSON.

    Takes everything from the first ``{`` through the last ``}``, which
    tolerates chatter the model may emit around the JSON payload.

    :raises ValueError: if no brace-delimited span is present
    :raises json.JSONDecodeError: if the span is not valid JSON
    """
    open_idx = text.find("{")
    close_idx = text.rfind("}")
    if open_idx == -1 or close_idx == -1:
        raise ValueError("No JSON object detected in model output.")
    return json.loads(text[open_idx:close_idx + 1])
|
| 142 |
+
|
| 143 |
+
def parse_answer_ids(text: str) -> Optional[List[Dict[str, int]]]:
    """Pull class IDs out of an ``<answer>...</answer>`` block.

    :return: ``[{'class_id': 123}, ...]`` on success, ``[]`` for an explicit
        ``None`` answer (or an empty body), and ``None`` when no answer block
        exists or parsing fails.
    """
    try:
        match = re.search(r'<answer>(.*?)</answer>', text, re.DOTALL)
        if match is None:
            return None
        payload = match.group(1).strip()
        # The prompt spec lets the model answer the literal word "None".
        if payload.lower() == "none":
            return []
        payload = payload.strip().strip('[]')
        # Primary format: parenthesized IDs like "(123), (456)".
        found = [
            {"class_id": int(g.group(1))}
            for g in re.finditer(r'\((\d+)\)', payload)
        ]
        # Fallback: a plain comma-separated list of bare integers.
        if not found and payload:
            tokens = (t.strip() for t in payload.split(","))
            found = [{"class_id": int(t)} for t in tokens if t.isdigit()]
        return found if found else []
    except Exception:
        # Any malformed content is treated as "unparseable".
        return None
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## 2) Stage 1 — Entity Extraction
|
| 169 |
+
|
| 170 |
+
**Prompt (exact as provided):**
|
| 171 |
+
|
| 172 |
+
```python
|
| 173 |
+
prompt_template_entity_extraction = """You are tasked with extracting keywords from scientific literature abstracts based on their domain classification.
|
| 174 |
+
Extract keywords that appear EXACTLY in the given abstract and organize them into 7 predefined keyword types.
|
| 175 |
+
Instructions:
|
| 176 |
+
1. Read the provided abstract and domain classification carefully
|
| 177 |
+
2. Extract keywords/phrases that appear verbatim in the abstract
|
| 178 |
+
3. Organize each keyword into the most appropriate keyword type
|
| 179 |
+
4. Each keyword should be assigned to only one type
|
| 180 |
+
5. Focus on meaningful technical terms, not common words
|
| 181 |
+
6. Return results in JSON format
|
| 182 |
+
Keyword Types for Organization:
|
| 183 |
+
1. core_concepts: Central theories, main ideas, or fundamental concepts that define the research
|
| 184 |
+
2. methodologies: Research methods, experimental techniques, analytical approaches, or procedural strategies
|
| 185 |
+
3. subjects_problems: Research subjects, target problems, phenomena under investigation, or challenges being addressed
|
| 186 |
+
4. findings_impacts: Key discoveries, results, outcomes, implications, or impacts of the research
|
| 187 |
+
5. theoretical_framework: Underlying theories, models, principles, or conceptual foundations
|
| 188 |
+
6. quantitative_metrics: Numerical values, measurements, statistics, percentages, or any quantifiable data
|
| 189 |
+
7. contextual_background: Historical context, motivation, prior work references, or situational background
|
| 190 |
+
Guidelines:
|
| 191 |
+
- Extract only words/phrases that exist exactly in the abstract
|
| 192 |
+
- Prefer technical terms over generic academic vocabulary
|
| 193 |
+
- Include both single words and meaningful phrases
|
| 194 |
+
- For quantitative metrics, include the complete value with units
|
| 195 |
+
- Ensure keywords are relevant to the domain classification Output must be in JSON format with all 7 keyword types as keys.
|
| 196 |
+
Example output format: {{ "core_concepts": ["CEST MRI", "thermally activated delayed fluorescence", "blue phosphorescent organic light-emitting diodes"], "methodologies": ["synthesized", "subspace-based spectral signal decomposition", "sphere formation assay"], "subjects_problems": ["z-spectrum analysis", "cancer stem cells", "charge balance"], "findings_impacts": ["high quantum efficiency", "inhibits mobility", "record high"], "theoretical_framework": ["saturation transfer phenomena", "energy transfer", "structure-property relationship"], "quantitative_metrics": ["Above 30%", "24.2%", "70-110 GHz", "40-80 μM"], "contextual_background": ["drug resistance", "alternative to conventional", "for molecular MRI"] }}
|
| 197 |
+
Extract keywords from the following scientific literature:
|
| 198 |
+
Abstract: {abstract}
|
| 199 |
+
Return the keywords organized by their types in JSON format with all 7 keyword types.
|
| 200 |
+
"""
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
```python
|
| 204 |
+
# Example input (replace with your real abstract)
|
| 205 |
+
abstract = "We present a novel lithium-sulfur battery cathode design using porous carbon hosts..."
|
| 206 |
+
|
| 207 |
+
entity_prompt = prompt_template_entity_extraction.format(abstract=abstract)
|
| 208 |
+
entity_output = generate_text(entity_prompt, GEN_COMMON)
|
| 209 |
+
entities = parse_json_object(entity_output) # dict with 7 keys
|
| 210 |
+
print(json.dumps(entities, indent=2, ensure_ascii=False))
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
---
|
| 214 |
+
|
| 215 |
+
## 3) Stage 2 — Compression
|
| 216 |
+
|
| 217 |
+
**Prompt (exact as provided):**
|
| 218 |
+
|
| 219 |
+
```python
|
| 220 |
+
prompt_template_compression = """You are a scientific document summarizer specializing in category-driven summarization.
|
| 221 |
+
|
| 222 |
+
Task: Create a concise summary using ONLY {max_items} items from the provided semantic categories (out of {total_items} total items).
|
| 223 |
+
|
| 224 |
+
Requirements:
|
| 225 |
+
- Write the summary in the same language as the original text
|
| 226 |
+
- Select the {max_items} most relevant items that align with the original text
|
| 227 |
+
- Use content from the original text ONLY when it directly supports these categories
|
| 228 |
+
- The summary should read as if the original text was written to illustrate the semantic categories
|
| 229 |
+
- Maintain scientific accuracy and use precise terminology
|
| 230 |
+
- Ensure logical flow and coherence between concepts
|
| 231 |
+
|
| 232 |
+
Input:
|
| 233 |
+
- Original Text: {text}
|
| 234 |
+
- Semantic Categories (in order of priority): {categories}
|
| 235 |
+
|
| 236 |
+
CRITICAL: You MUST output ONLY a valid JSON object in exactly this format:
|
| 237 |
+
{{"response": "Your concise summary here"}}
|
| 238 |
+
|
| 239 |
+
Do not include any text before or after the JSON object. The summary should be a single continuous text without line breaks.
|
| 240 |
+
|
| 241 |
+
Output Format (example):
|
| 242 |
+
{{"response": "This research focuses on developing novel battery materials using advanced synthesis methods, achieving significant improvements in energy density and cycle stability through optimized electrode design."}}
|
| 243 |
+
"""
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
```python
|
| 247 |
+
# Choose how many items you want to keep
|
| 248 |
+
max_items = 10
|
| 249 |
+
categories = list(entities.keys()) # ["core_concepts", "methodologies", ...]
|
| 250 |
+
total_items = sum(len(v) for v in entities.values())
|
| 251 |
+
|
| 252 |
+
compression_prompt = prompt_template_compression.format(
|
| 253 |
+
max_items=max_items,
|
| 254 |
+
total_items=total_items,
|
| 255 |
+
text=abstract,
|
| 256 |
+
categories=categories,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
compression_output = generate_text(compression_prompt, GEN_COMMON)
|
| 260 |
+
compressed = parse_json_object(compression_output)["response"]
|
| 261 |
+
print("Compressed summary:", compressed)
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
## 4) Stage 3 — Classification (Patent-focused)
|
| 267 |
+
|
| 268 |
+
**Prompt (exact as provided):**
|
| 269 |
+
|
| 270 |
+
```python
|
| 271 |
+
prompt_template_classification = """You are a text classification expert specializing in patent documents.
|
| 272 |
+
You are given a JSON record for a target patent and a set of Retrieved Similar Items.
|
| 273 |
+
Your task is to assign one or more class labels to a given target patent using the provided examples as guidance.
|
| 274 |
+
|
| 275 |
+
---
|
| 276 |
+
|
| 277 |
+
**Step-by-Step Instructions:**
|
| 278 |
+
|
| 279 |
+
1. **Analyze Target and Retrieved Examples:**
|
| 280 |
+
- Review each example, paying attention to the class label and how the text reflects it.
|
| 281 |
+
- Focus on technical innovation, claims, and patent-specific terminology.
|
| 282 |
+
|
| 283 |
+
2. **Similarity Scoring (1–5):**
|
| 284 |
+
For each Retrieved Similar Item, score along three dimensions and sum to 1–5:
|
| 285 |
+
- Domain (0–2):
|
| 286 |
+
- 2: Same primary technology field
|
| 287 |
+
- 1: Closely related technology
|
| 288 |
+
- 0: Unrelated
|
| 289 |
+
- Innovation Type (0–2):
|
| 290 |
+
- 2: Same type of innovation (e.g., device, method, composition)
|
| 291 |
+
- 1: Partial overlap in innovation approach
|
| 292 |
+
- 0: Different innovation type
|
| 293 |
+
- Application/Material (0–1):
|
| 294 |
+
- 1: Shares key technical terms or entities
|
| 295 |
+
- 0: Different application/material
|
| 296 |
+
|
| 297 |
+
3. **Total Score → Similarity Label:**
|
| 298 |
+
- 5: Fully similar (Domain=2 + Innovation=2 + Application=1)
|
| 299 |
+
- 4: Mostly similar (sum = 4)
|
| 300 |
+
- 3: Partially similar (sum = 3)
|
| 301 |
+
- 2: Little similarity (sum = 2)
|
| 302 |
+
- 1: Irrelevant (sum = 0 or 1)
|
| 303 |
+
|
| 304 |
+
4. **Make a Classification Decision:**
|
| 305 |
+
- Based on all retrieved items, assign the most appropriate class ID(s) to the target.
|
| 306 |
+
|
| 307 |
+
---
|
| 308 |
+
|
| 309 |
+
**Response Format:**
|
| 310 |
+
|
| 311 |
+
1. **Chain-of-Thought** (between `<begin_of_thought>` and `<end_of_thought>`):
|
| 312 |
+
- Summarize the target's core innovation, claims, and technical field.
|
| 313 |
+
- For each Retrieved Similar Item, analyze its similarity and assign score.
|
| 314 |
+
- Conclude with overall comparison.
|
| 315 |
+
|
| 316 |
+
2. **Final Answer:**
|
| 317 |
+
- Provide classification with brief justification.
|
| 318 |
+
- Output ONLY the list of class id values.
|
| 319 |
+
|
| 320 |
+
**Use exactly this structure and STOP immediately after </answer>:**
|
| 321 |
+
```
|
| 322 |
+
<begin_of_thought>
|
| 323 |
+
<p>Target patent analysis... </p>
|
| 324 |
+
<p>Reference[Item ID=...], [Similarity=...], judgment text</p>
|
| 325 |
+
...
|
| 326 |
+
<end_of_thought>
|
| 327 |
+
<solution>Overall evaluation=...</solution>
|
| 328 |
+
<answer>[Class_label_ID_1, Class_label_ID_2, ...]</answer>
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
**CRITICAL: Your response MUST end with </answer>. Do not add any text after the closing </answer> tag.**
|
| 332 |
+
|
| 333 |
+
---
|
| 334 |
+
|
| 335 |
+
**Special Condition:**
|
| 336 |
+
- If Total Score ≤ 2:
|
| 337 |
+
- `<solution>`: Cannot determine answer
|
| 338 |
+
- `<answer>`: None
|
| 339 |
+
- Otherwise:
|
| 340 |
+
- `<solution>`: Overall evaluation=...
|
| 341 |
+
- `<answer>`: [<Class_label_ID_1>, <Class_label_ID_2>, ...]
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
**Input Data:**
|
| 346 |
+
|
| 347 |
+
- Target ID: {target_id}
|
| 348 |
+
|
| 349 |
+
- Target Text: {target_text}
|
| 350 |
+
|
| 351 |
+
- Retrieved Similar Items (Top {retrieved_count}):
|
| 352 |
+
{retrieved_items_text}
|
| 353 |
+
---
|
| 354 |
+
|
| 355 |
+
"""
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
```python
|
| 359 |
+
# Example retrieved neighbors (use COMPRESSED text for better accuracy/latency)
|
| 360 |
+
retrieved = [
|
| 361 |
+
{"id": "US-AAA", "label": "H01M10/0525", "text": "Porous carbon hosts for Li-S cathodes..."},
|
| 362 |
+
{"id": "US-BBB", "label": "H01M4/13", "text": "Conductive polymer binder for sulfur cathode..."},
|
| 363 |
+
]
|
| 364 |
+
|
| 365 |
+
retrieved_items_text = "\n".join(
|
| 366 |
+
f"- ID: {r['id']}\n Label: {r.get('label','')}\n Text: {r['text']}" for r in retrieved
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
classification_prompt = prompt_template_classification.format(
|
| 370 |
+
target_id="TARGET-1",
|
| 371 |
+
target_text=compressed, # classify on compressed text
|
| 372 |
+
retrieved_count=len(retrieved),
|
| 373 |
+
retrieved_items_text=retrieved_items_text,
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
# Use stop at </answer> for clean termination
|
| 377 |
+
cls_text = generate_text(classification_prompt, GEN_CLASSIFICATION)
|
| 378 |
+
if '</answer>' not in cls_text and '<answer>' in cls_text:
|
| 379 |
+
cls_text += '</answer>'
|
| 380 |
+
|
| 381 |
+
print(cls_text)
|
| 382 |
+
parsed_ids = parse_answer_ids(cls_text)
|
| 383 |
+
print("parsed:", parsed_ids)
|
| 384 |
+
```
|
| 385 |
+
|
| 386 |
+
---
|
| 387 |
+
|
| 388 |
+
## 5) End-to-End Helper (Optional)
|
| 389 |
+
|
| 390 |
+
```python
|
| 391 |
+
def comp4cls_pipeline(abstract: str, retrieve_fn, k: int = 10) -> dict:
    """Run the full Comp4Cls flow: extraction → compression → classification.

    :param abstract: raw document text
    :param retrieve_fn: callable(query_text, k) -> [{id, label, text}, ...]
    :param k: number of neighbors to retrieve
    :return: {"entities": {...}, "compressed": "...", "classification_raw": "...", "parsed_ids": [...]}
    """
    # --- Stage 1: entity extraction ---
    stage1_raw = generate_text(
        prompt_template_entity_extraction.format(abstract=abstract), GEN_COMMON
    )
    entities = parse_json_object(stage1_raw)

    # --- Stage 2: category-driven compression ---
    keep = 10
    category_names = list(entities.keys())
    item_count = sum(len(vals) for vals in entities.values())
    stage2_raw = generate_text(
        prompt_template_compression.format(
            max_items=keep, total_items=item_count, text=abstract, categories=category_names
        ),
        GEN_COMMON,
    )
    compressed = parse_json_object(stage2_raw)["response"]

    # --- Stage 3: retrieval + classification ---
    neighbors = retrieve_fn(compressed, k=k)  # [{"id","label","text"}, ...]
    neighbor_lines = "\n".join(
        f"- ID: {r['id']}\n Label: {r.get('label','')}\n Text: {r['text']}" for r in neighbors
    )
    cls_raw = generate_text(
        prompt_template_classification.format(
            target_id="TARGET-1",
            target_text=compressed,
            retrieved_count=len(neighbors),
            retrieved_items_text=neighbor_lines,
        ),
        GEN_CLASSIFICATION,
    )
    # stop=["</answer>"] trims the closing tag; restore it so parsing works.
    if '</answer>' not in cls_raw and '<answer>' in cls_raw:
        cls_raw += '</answer>'
    parsed = parse_answer_ids(cls_raw)
    return {"entities": entities, "compressed": compressed, "classification_raw": cls_raw, "parsed_ids": parsed}
|
| 429 |
+
```
|
| 430 |
+
|
| 431 |
+
---
|
| 432 |
+
|
| 433 |
+
## 6) Notes
|
| 434 |
+
|
| 435 |
+
- Stage-1/2 prompts demand **strict JSON**. The helper `parse_json_object` extracts the first valid JSON block.
|
| 436 |
+
- For Stage-3, keep `stop=["</answer>"]` to avoid over-generation and simplify parsing.
|
| 437 |
+
- Swap `MODEL_NAME` for your fine-tuned repo (e.g., `gsjang/lim-4b-1-0826`) if desired.
|
| 438 |
+
- Retrieval should use **compressed** texts for both query and neighbors.
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
# Citation
|
| 443 |
|
| 444 |
**APA:**
|
| 445 |
|
| 446 |
+
Lim, C. (2026). Comp4Cls: Semantic Compression for Enhanced Retrieval-Augmented Classification of Real-World Scientific and Technical Documents. ICDE 2026 (submitted).
|
| 447 |
|
| 448 |
## Glossary [optional]
|
| 449 |
|