Custom Annotators
Learn how to create and integrate custom annotators into the pyCTAKES pipeline.
Overview
pyCTAKES provides a flexible architecture for creating custom annotators that can be seamlessly integrated into processing pipelines. Custom annotators allow you to:
- Implement domain-specific processing logic
- Integrate external NLP models or services
- Add proprietary algorithms or rules
- Extend functionality for specific use cases
Base Annotator Interface
All annotators must extend the BaseAnnotator
class:
from pyctakes.annotators.base import BaseAnnotator
from pyctakes.types import Document
class CustomAnnotator(BaseAnnotator):
def __init__(self, **kwargs):
super().__init__()
# Initialize your annotator
def process(self, doc: Document) -> Document:
# Your processing logic here
return doc
Simple Custom Annotator Example
Here's a basic custom annotator that identifies phone numbers:
import re
from pyctakes.annotators.base import BaseAnnotator
from pyctakes.types import Document, Entity
class PhoneNumberAnnotator(BaseAnnotator):
def __init__(self, **kwargs):
super().__init__()
self.phone_pattern = re.compile(
r'\b(?:\+?1[-.\s]?)?'
r'\(?([0-9]{3})\)?[-.\s]?'
r'([0-9]{3})[-.\s]?'
r'([0-9]{4})\b'
)
def process(self, doc: Document) -> Document:
# Find phone numbers in text
for match in self.phone_pattern.finditer(doc.text):
entity = Entity(
start=match.start(),
end=match.end(),
text=match.group(),
label="PHONE_NUMBER",
confidence=1.0
)
doc.entities.append(entity)
return doc
Advanced Custom Annotator
Here's a more sophisticated annotator that integrates with an external API:
import requests
from typing import Dict, Any
from pyctakes.annotators.base import BaseAnnotator
from pyctakes.types import Document, Entity, UMLSConcept
class ExternalNERAnnotator(BaseAnnotator):
def __init__(self, api_url: str, api_key: str, **kwargs):
super().__init__()
self.api_url = api_url
self.api_key = api_key
self.headers = {"Authorization": f"Bearer {api_key}"}
self.config = kwargs
def process(self, doc: Document) -> Document:
try:
# Call external NER API
response = self._call_api(doc.text)
# Process API response
entities = self._parse_response(response)
# Add entities to document
doc.entities.extend(entities)
except Exception as e:
self.logger.error(f"External API failed: {e}")
# Optionally fall back to rule-based approach
return doc
def _call_api(self, text: str) -> Dict[str, Any]:
payload = {
"text": text,
"options": self.config
}
response = requests.post(
self.api_url,
json=payload,
headers=self.headers,
timeout=30
)
response.raise_for_status()
return response.json()
def _parse_response(self, response: Dict[str, Any]) -> List[Entity]:
entities = []
for item in response.get("entities", []):
entity = Entity(
start=item["start"],
end=item["end"],
text=item["text"],
label=item["label"],
confidence=item.get("confidence", 1.0)
)
# Add UMLS concept if available
if "concept" in item:
entity.umls_concept = UMLSConcept(
cui=item["concept"]["cui"],
preferred_term=item["concept"]["name"],
confidence=item["concept"]["score"]
)
entities.append(entity)
return entities
Configuration Support
Add configuration support to your custom annotators:
from pyctakes.annotators.base import BaseAnnotator
class ConfigurableAnnotator(BaseAnnotator):
def __init__(self, config: Dict[str, Any] = None, **kwargs):
super().__init__()
# Default configuration
self.config = {
"case_sensitive": False,
"min_confidence": 0.5,
"max_entities": 100,
"custom_patterns": {}
}
# Update with provided config
if config:
self.config.update(config)
# Update with keyword arguments
self.config.update(kwargs)
# Initialize with configuration
self._initialize()
def _initialize(self):
# Setup based on configuration
self.case_sensitive = self.config["case_sensitive"]
self.min_confidence = self.config["min_confidence"]
# ... other initialization
Error Handling
Implement robust error handling in your annotators:
from pyctakes.annotators.base import BaseAnnotator, AnnotationError
class RobustAnnotator(BaseAnnotator):
def process(self, doc: Document) -> Document:
try:
return self._process_safely(doc)
except Exception as e:
self.logger.error(f"Annotation failed: {e}")
# Option 1: Re-raise as AnnotationError
raise AnnotationError(f"RobustAnnotator failed: {e}")
# Option 2: Return document unchanged
# return doc
# Option 3: Apply fallback processing
# return self._fallback_process(doc)
def _process_safely(self, doc: Document) -> Document:
# Your main processing logic
pass
def _fallback_process(self, doc: Document) -> Document:
# Simplified fallback processing
pass
Multi-language Support
Create annotators that support multiple languages:
from pyctakes.annotators.base import BaseAnnotator
class MultilingualAnnotator(BaseAnnotator):
def __init__(self, language: str = "en", **kwargs):
super().__init__()
self.language = language
self.patterns = self._load_patterns(language)
def _load_patterns(self, language: str) -> Dict[str, List[str]]:
patterns = {
"en": {
"MEDICATION": ["aspirin", "ibuprofen"],
"CONDITION": ["diabetes", "hypertension"]
},
"es": {
"MEDICATION": ["aspirina", "ibuprofeno"],
"CONDITION": ["diabetes", "hipertensión"]
}
}
return patterns.get(language, patterns["en"])
def process(self, doc: Document) -> Document:
# Process using language-specific patterns
for label, terms in self.patterns.items():
doc = self._find_entities(doc, terms, label)
return doc
Performance Optimization
Optimize your annotators for better performance:
from functools import lru_cache
from pyctakes.annotators.base import BaseAnnotator
class OptimizedAnnotator(BaseAnnotator):
def __init__(self, **kwargs):
super().__init__()
# Pre-compile patterns for better performance
self.compiled_patterns = self._compile_patterns()
# Cache for expensive operations
self._cache = {}
@lru_cache(maxsize=1000)
def _expensive_operation(self, text: str) -> str:
# Expensive computation cached with LRU
pass
def _compile_patterns(self):
# Pre-compile regex patterns
patterns = {}
for label, pattern_list in self.raw_patterns.items():
patterns[label] = [re.compile(p, re.IGNORECASE)
for p in pattern_list]
return patterns
def process(self, doc: Document) -> Document:
# Use compiled patterns for faster matching
for label, patterns in self.compiled_patterns.items():
for pattern in patterns:
for match in pattern.finditer(doc.text):
# Process match
pass
return doc
Testing Custom Annotators
Create comprehensive tests for your custom annotators:
import pytest
from pyctakes.types import Document
from your_module import CustomAnnotator
class TestCustomAnnotator:
def setup_method(self):
self.annotator = CustomAnnotator()
def test_basic_functionality(self):
doc = Document(text="Test input text")
result = self.annotator.process(doc)
assert len(result.entities) > 0
assert result.entities[0].label == "EXPECTED_LABEL"
def test_empty_input(self):
doc = Document(text="")
result = self.annotator.process(doc)
assert len(result.entities) == 0
def test_configuration(self):
annotator = CustomAnnotator(custom_param="value")
doc = Document(text="Test text")
result = annotator.process(doc)
# Test configuration effects
assert result is not None
def test_error_handling(self):
# Test that annotator handles errors gracefully
doc = Document(text="Text that might cause errors")
# Should not raise exception
result = self.annotator.process(doc)
assert result is not None
Integration with Pipeline
Register your custom annotator for use in pipelines:
from pyctakes.pipeline import Pipeline
# Method 1: Direct addition
pipeline = Pipeline()
pipeline.add_annotator(CustomAnnotator())
# Method 2: Configuration-based
config = {
"annotators": [
{
"name": "custom",
"class": "your_module.CustomAnnotator",
"config": {
"param1": "value1"
}
}
]
}
pipeline = Pipeline.from_config(config)
Plugin Architecture
Create installable plugins for your annotators:
# setup.py
from setuptools import setup
setup(
name="pyctakes-custom-plugin",
version="1.0.0",
packages=["pyctakes_custom"],
entry_points={
"pyctakes.annotators": [
"custom = pyctakes_custom:CustomAnnotator"
]
}
)
Best Practices
- Follow naming conventions: Use descriptive class names ending in "Annotator"
- Document thoroughly: Include docstrings and type hints
- Handle errors gracefully: Don't let your annotator crash the pipeline
- Make it configurable: Support configuration for flexibility
- Test extensively: Unit tests and integration tests
- Optimize for performance: Profile and optimize bottlenecks
- Support standard types: Use pyCTAKES type system correctly
- Log appropriately: Use logging for debugging and monitoring
Example: Complete Custom Annotator
Here's a complete example of a well-structured custom annotator:
"""
Custom vital signs annotator for pyCTAKES.
"""
import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from pyctakes.annotators.base import BaseAnnotator
from pyctakes.types import Document, Entity
@dataclass
class VitalSign:
name: str
value: float
unit: str
normal_range: tuple
class VitalSignsAnnotator(BaseAnnotator):
"""
Annotator for extracting vital signs from clinical text.
Recognizes common vital signs like blood pressure, heart rate,
temperature, respiratory rate, and oxygen saturation.
"""
def __init__(self, config: Optional[Dict[str, Any]] = None, **kwargs):
super().__init__()
self.config = {
"include_normal_ranges": True,
"min_confidence": 0.8,
"custom_patterns": {}
}
if config:
self.config.update(config)
self.config.update(kwargs)
self.logger = logging.getLogger(__name__)
self._compile_patterns()
def _compile_patterns(self) -> None:
"""Compile regex patterns for vital signs recognition."""
patterns = {
"BLOOD_PRESSURE": [
r'\b(?:BP|Blood Pressure):\s*(\d{2,3})/(\d{2,3})\s*(?:mmHg)?\b',
r'\b(\d{2,3})/(\d{2,3})\s*mmHg\b'
],
"HEART_RATE": [
r'\b(?:HR|Heart Rate):\s*(\d{2,3})\s*(?:bpm)?\b',
r'\bPulse:\s*(\d{2,3})\s*(?:bpm)?\b'
],
"TEMPERATURE": [
r'\b(?:Temp|Temperature):\s*(\d{2,3}(?:\.\d)?)\s*°?([CF])?\b',
r'\b(\d{2,3}(?:\.\d)?)\s*°([CF])\b'
]
}
# Add custom patterns
patterns.update(self.config.get("custom_patterns", {}))
# Compile patterns
self.compiled_patterns = {}
for vital_type, pattern_list in patterns.items():
self.compiled_patterns[vital_type] = [
re.compile(pattern, re.IGNORECASE) for pattern in pattern_list
]
def process(self, doc: Document) -> Document:
"""
Process document to extract vital signs.
Args:
doc: Input document
Returns:
Document with vital signs entities added
"""
try:
for vital_type, patterns in self.compiled_patterns.items():
entities = self._extract_vital_signs(doc.text, vital_type, patterns)
doc.entities.extend(entities)
self.logger.debug(f"Extracted {len(doc.entities)} vital signs")
except Exception as e:
self.logger.error(f"Vital signs extraction failed: {e}")
return doc
def _extract_vital_signs(self, text: str, vital_type: str,
patterns: List[re.Pattern]) -> List[Entity]:
"""Extract specific type of vital signs from text."""
entities = []
for pattern in patterns:
for match in pattern.finditer(text):
entity = self._create_entity(match, vital_type)
if entity and entity.confidence >= self.config["min_confidence"]:
entities.append(entity)
return entities
def _create_entity(self, match: re.Match, vital_type: str) -> Optional[Entity]:
"""Create entity from regex match."""
try:
vital_sign = self._parse_vital_sign(match, vital_type)
confidence = self._calculate_confidence(vital_sign)
entity = Entity(
start=match.start(),
end=match.end(),
text=match.group(),
label=vital_type,
confidence=confidence
)
# Add structured data
entity.vital_sign = vital_sign
return entity
except Exception as e:
self.logger.warning(f"Failed to parse vital sign: {e}")
return None
def _parse_vital_sign(self, match: re.Match, vital_type: str) -> VitalSign:
"""Parse vital sign from regex match."""
if vital_type == "BLOOD_PRESSURE":
systolic = int(match.group(1))
diastolic = int(match.group(2))
return VitalSign(
name="Blood Pressure",
value=(systolic, diastolic),
unit="mmHg",
normal_range=(90, 140)
)
elif vital_type == "HEART_RATE":
rate = int(match.group(1))
return VitalSign(
name="Heart Rate",
value=rate,
unit="bpm",
normal_range=(60, 100)
)
# Add other vital signs...
raise ValueError(f"Unknown vital type: {vital_type}")
def _calculate_confidence(self, vital_sign: VitalSign) -> float:
"""Calculate confidence based on vital sign reasonableness."""
# Simple heuristic: higher confidence for values in normal range
base_confidence = 0.9
if self._is_in_normal_range(vital_sign):
return base_confidence
else:
return base_confidence * 0.8
def _is_in_normal_range(self, vital_sign: VitalSign) -> bool:
"""Check if vital sign is in normal range."""
if vital_sign.name == "Blood Pressure":
systolic, diastolic = vital_sign.value
return (90 <= systolic <= 140) and (60 <= diastolic <= 90)
elif vital_sign.name == "Heart Rate":
return 60 <= vital_sign.value <= 100
return True # Default to normal if unknown
This example demonstrates all the best practices for creating robust, configurable, and well-tested custom annotators for pyCTAKES.