UMLS Integration
Comprehensive guide to integrating pyCTAKES with the Unified Medical Language System (UMLS).
Overview
The Unified Medical Language System (UMLS) is a comprehensive set of files and software that brings together biomedical vocabularies and standards to enable interoperability between systems. pyCTAKES provides flexible UMLS integration for concept normalization and mapping.
UMLS Components
Key Vocabularies
- SNOMED CT: Systematized Nomenclature of Medicine – Clinical Terms (comprehensive clinical terminology)
- RxNorm: Normalized drug names
- LOINC: Laboratory tests and clinical observations
- ICD-10-CM: International Classification of Diseases, 10th Revision, Clinical Modification
- CPT: Current procedural terminology
- MeSH: Medical subject headings
UMLS Identifiers
- CUI: Concept Unique Identifier (primary key)
- AUI: Atom Unique Identifier (one occurrence of a string in a specific source vocabulary)
- SUI: String Unique Identifier (unique strings)
Setup and Configuration
UMLS License and Access
- Obtain UMLS License: Register with the UMLS Terminology Services (UTS) at https://uts.nlm.nih.gov
- Download UMLS: Get the current release
- Generate API Key: Create API key for programmatic access
Installation Options
Option 1: QuickUMLS (Recommended)
# Install QuickUMLS
pip install quickumls
# Build the QuickUMLS index from a local UMLS release.
# Both paths are positional arguments: first the directory that contains the
# UMLS release files (MRCONSO.RRF and MRSTY.RRF), then the output directory.
python -m quickumls.install /path/to/umls/installation \
    /path/to/quickumls/data
Option 2: UMLS REST API
# Use UMLS REST API (requires API key)
import requests
class UMLSClient:
    """Skeleton client for the UMLS REST API."""

    def __init__(self, api_key):
        """Store the API key and the root URL of the REST service."""
        self.base_url = "https://uts-ws.nlm.nih.gov/rest"
        self.api_key = api_key

    def search_concept(self, term):
        """Placeholder for a concept search; currently does nothing.

        Returns:
            ``None`` -- the REST call is not implemented in this skeleton.
        """
        # Implementation for REST API calls
        return None
Option 3: Local Dictionary
# Use simplified local dictionary for development
# Each key is a lower-case surface term; the value describes the concept it
# maps to (CUI, preferred term and semantic type identifiers).
umls_dict = {
    "diabetes": dict(
        cui="C0011849",
        preferred_term="Diabetes Mellitus",
        semantic_types=["T047"],
    ),
}
pyCTAKES UMLS Integration
Basic UMLS Annotator
from pyctakes.annotators.umls import UMLSConceptMapper
# Create UMLS annotator
umls = UMLSConceptMapper(
    umls_path="/path/to/quickumls/data",
    similarity_threshold=0.8,
    max_candidates=5,
)

# Add to pipeline (`pipeline` is expected to exist already; it is not created
# in this snippet)
pipeline.add_annotator(umls)

# Process text
doc = pipeline.process_text("Patient has diabetes and hypertension.")

# Access UMLS concepts: entities without a mapped concept are skipped
for entity in doc.entities:
    if not entity.umls_concept:
        continue
    concept = entity.umls_concept
    print(f"{entity.text} -> {concept.cui} ({concept.preferred_term})")
Configuration Options
# Example configuration for the UMLS component. Everything lives under the
# single top-level "umls" key.
umls_config = {
    "umls": {
        "enabled": True,
        "umls_path": "/path/to/quickumls/data",
        # Minimum similarity score a candidate must reach
        "similarity_threshold": 0.8,
        # Maximum candidates kept per term
        "max_candidates": 5,
        # Keep only concepts carrying one of these semantic types
        "semantic_types": [
            "T047",  # Disease or Syndrome
            "T184",  # Sign or Symptom
            "T121",  # Pharmacologic Substance
        ],
        # Keep only concepts from these source vocabularies
        "sources": ["SNOMEDCT_US", "RXNORM", "ICD10CM"],
        # How overlapping matches are resolved
        "overlaps": "length",
        # QuickUMLS threshold
        "threshold": 0.8,
        # Context window size
        "window": 5,
        # Similarity function
        "similarity_name": "jaccard",
        # Semantic type filter (None leaves it unset)
        "accepted_semtypes": None,
        # Result caching switch and cache size limit
        "enable_caching": True,
        "cache_size": 10000,
    },
}
Advanced UMLS Annotator
from pyctakes.annotators.umls import AdvancedUMLSAnnotator
from pyctakes.types import UMLSConcept
class AdvancedUMLSAnnotator(BaseAnnotator):
    """Annotator that links document entities to UMLS concepts via QuickUMLS.

    Keyword options (all optional) are kept in ``self.config``:

    * ``threshold`` -- QuickUMLS matching threshold (default ``0.8``).
    * ``similarity_name`` (or legacy ``similarity``) -- similarity function
      passed to QuickUMLS (default ``"jaccard"``).
    * ``window`` -- QuickUMLS window size (default ``5``).
    * ``overlaps`` -- QuickUMLS overlapping criteria (default ``"length"``).
    * ``similarity_threshold`` -- minimum confidence a concept must reach to
      survive filtering (default ``0.8``).
    * ``semantic_types`` / ``sources`` -- allow-lists; a missing key or a
      ``None`` value disables the corresponding filter.
    * ``max_candidates`` -- maximum concepts kept per entity (default ``5``).
    * ``enable_caching`` / ``cache_size`` -- result cache switch and size
      limit (defaults ``True`` / ``10000``).
    """

    def __init__(self, umls_path, **kwargs):
        """Create the QuickUMLS matcher for the index stored at *umls_path*."""
        super().__init__()
        # Imported lazily so the module can be loaded without QuickUMLS.
        from quickumls import QuickUMLS
        self.matcher = QuickUMLS(
            umls_path,
            overlapping_criteria=kwargs.get("overlaps", "length"),
            threshold=kwargs.get("threshold", 0.8),
            # Accept the documented "similarity_name" key as well as the
            # older "similarity" spelling.
            similarity_name=kwargs.get(
                "similarity_name", kwargs.get("similarity", "jaccard")
            ),
            window=kwargs.get("window", 5),
        )
        self.config = kwargs
        self.cache = {}

    def process(self, doc: Document) -> Document:
        """Attach the best matching UMLS concept to every entity of *doc*.

        Entities for which no concept survives filtering are left untouched.
        """
        for entity in doc.entities:
            concepts = self._map_entity_to_umls(entity)
            if concepts:
                entity.umls_concept = self._select_best_concept(concepts)
        return doc

    def _map_entity_to_umls(self, entity):
        """Map entity text to a filtered, confidence-ordered concept list."""
        # The cache is keyed on the lower-cased entity text.
        cache_key = entity.text.lower()
        if cache_key in self.cache:
            return self.cache[cache_key]

        concepts = []
        for match in self.matcher.match(entity.text):
            for candidate in match:
                concepts.append(UMLSConcept(
                    cui=candidate['cui'],
                    # QuickUMLS reports the matched UMLS string under 'term';
                    # its 'preferred' field is only a 0/1 flag.
                    preferred_term=candidate['term'],
                    semantic_types=candidate['semtypes'],
                    # NOTE(review): QuickUMLS candidates do not appear to
                    # carry a 'sources' field, so a configured "sources"
                    # filter may reject every concept -- confirm.
                    sources=candidate.get('sources', []),
                    confidence=candidate['similarity'],
                ))

        concepts = self._filter_concepts(concepts)

        if self.config.get("enable_caching", True):
            limit = self.config.get("cache_size", 10000)
            # Evict the oldest entry (dicts keep insertion order) so the
            # cache never grows beyond the configured size.
            if limit and len(self.cache) >= limit:
                self.cache.pop(next(iter(self.cache)))
            if limit:
                self.cache[cache_key] = concepts
        return concepts

    def _filter_concepts(self, concepts):
        """Filter concepts by configuration.

        Returns at most ``max_candidates`` concepts, ordered from highest to
        lowest confidence.
        """
        min_confidence = self.config.get("similarity_threshold", 0.8)
        allowed_semtypes = self.config.get("semantic_types")
        allowed_sources = self.config.get("sources")

        filtered = []
        for concept in concepts:
            if concept.confidence < min_confidence:
                continue
            if allowed_semtypes is not None and not any(
                st in concept.semantic_types for st in allowed_semtypes
            ):
                continue
            if allowed_sources is not None and not any(
                src in concept.sources for src in allowed_sources
            ):
                continue
            filtered.append(concept)

        # Rank before truncating; otherwise the best candidate could be cut
        # off simply because QuickUMLS returned it late.
        filtered.sort(key=lambda c: c.confidence, reverse=True)
        return filtered[:self.config.get("max_candidates", 5)]

    def _select_best_concept(self, concepts):
        """Return the highest-confidence concept, or ``None`` if empty.

        The input list is not modified (it may be a shared cache entry). On
        ties the earliest concept wins.
        """
        if not concepts:
            return None
        return max(concepts, key=lambda c: c.confidence)
Semantic Types
Common Semantic Types in Clinical NLP
# UMLS semantic type identifiers (TUIs) commonly used in clinical NLP, mapped
# to their names and grouped by broad category.
CLINICAL_SEMANTIC_TYPES = {
    # Disorders
    "T047": "Disease or Syndrome",
    "T048": "Mental or Behavioral Dysfunction",
    "T191": "Neoplastic Process",
    "T046": "Pathologic Function",

    # Signs and Symptoms
    "T184": "Sign or Symptom",
    "T033": "Finding",

    # Anatomy
    "T017": "Anatomical Structure",
    "T029": "Body Location or Region",
    "T023": "Body Part, Organ, or Organ Component",

    # Procedures
    "T060": "Diagnostic Procedure",
    "T061": "Therapeutic or Preventive Procedure",
    "T059": "Laboratory Procedure",

    # Substances
    "T121": "Pharmacologic Substance",
    "T200": "Clinical Drug",
    "T103": "Chemical",

    # Organizations and manufactured objects
    "T093": "Health Care Related Organization",
    "T073": "Manufactured Object",
}

# Filter by clinical semantic types: every identifier above, in table order
clinical_filter = [*CLINICAL_SEMANTIC_TYPES]
Semantic Type Filtering
def filter_by_semantic_types(concepts, allowed_types):
    """Keep the concepts that carry at least one allowed semantic type.

    Args:
        concepts: Iterable of objects exposing a ``semantic_types`` container.
        allowed_types: Iterable of semantic type identifiers to accept.

    Returns:
        A new list with the matching concepts in their original order.
    """
    def _is_allowed(concept):
        return any(tui in concept.semantic_types for tui in allowed_types)

    return [concept for concept in concepts if _is_allowed(concept)]
# Usage
# NOTE: `concepts` is not defined in this snippet; it is expected to be an
# existing collection of concept objects.
medication_types = ["T121", "T200"] # Pharmacologic substances and drugs
med_concepts = filter_by_semantic_types(concepts, medication_types)
Entity-Specific UMLS Mapping
Medication Mapping
class MedicationUMLSMapper(BaseAnnotator):
    """Annotator that maps ``MEDICATION`` entities to RxNorm-sourced concepts."""

    def __init__(self, **kwargs):
        """Build the underlying concept mapper.

        Args:
            **kwargs: Options forwarded to ``UMLSConceptMapper`` (for example
                ``umls_path``). They override the medication defaults below.
        """
        super().__init__()
        # Focus on RxNorm for medications. These are defaults only; caller
        # supplied keyword arguments take precedence rather than being dropped.
        mapper_options = {
            "sources": ["RXNORM"],
            "semantic_types": ["T121", "T200"],  # Pharmacologic substances
            "similarity_threshold": 0.85,
        }
        mapper_options.update(kwargs)
        self.umls = UMLSConceptMapper(**mapper_options)

    def process(self, doc: Document) -> Document:
        """Attach concept information to every medication entity in *doc*.

        Entities with a different label, or for which the mapper returns no
        concepts, are left untouched.
        """
        for entity in doc.entities:
            # Only process medication entities
            if entity.label != "MEDICATION":
                continue
            concepts = self.umls.map_entity(entity)
            if not concepts:
                continue
            best = concepts[0]
            entity.umls_concept = best
            # NOTE(review): this stores the concept's UMLS CUI under an
            # RxNorm-named attribute; it is not an RxNorm RXCUI.
            entity.rxnorm_cui = best.cui
            # NOTE(review): _get_generic_name is not defined in this class;
            # presumably provided by BaseAnnotator or a subclass -- confirm.
            entity.generic_name = self._get_generic_name(best)
        return doc
Condition Mapping
class ConditionUMLSMapper(BaseAnnotator):
    """Annotator that maps condition/disorder entities to SNOMED CT-sourced concepts."""

    def __init__(self, **kwargs):
        """Build the underlying concept mapper.

        Args:
            **kwargs: Options forwarded to ``UMLSConceptMapper`` (for example
                ``umls_path``). They override the condition defaults below.
        """
        super().__init__()
        # Focus on SNOMED CT for conditions. These are defaults only; caller
        # supplied keyword arguments take precedence rather than being dropped.
        mapper_options = {
            "sources": ["SNOMEDCT_US"],
            "semantic_types": ["T047", "T048", "T191"],  # Diseases and disorders
            "similarity_threshold": 0.8,
        }
        mapper_options.update(kwargs)
        self.umls = UMLSConceptMapper(**mapper_options)

    def process(self, doc: Document) -> Document:
        """Attach concept information to every condition entity in *doc*.

        Only entities labelled ``CONDITION`` or ``DISORDER`` are considered;
        those for which the mapper returns no concepts are left untouched.
        """
        for entity in doc.entities:
            if entity.label not in ("CONDITION", "DISORDER"):
                continue
            concepts = self.umls.map_entity(entity)
            if not concepts:
                continue
            best = concepts[0]
            entity.umls_concept = best
            # NOTE(review): this stores the concept's UMLS CUI under a
            # SNOMED-named attribute; it is not a SNOMED CT concept id.
            entity.snomed_code = best.cui
            # NOTE(review): _get_icd10_mappings is not defined in this class;
            # presumably provided by BaseAnnotator or a subclass -- confirm.
            entity.icd10_codes = self._get_icd10_mappings(best)
        return doc
Performance Optimization
Caching Strategies
from functools import lru_cache
import pickle
class CachedUMLSMapper:
    """Term-to-concept mapper with an on-disk (pickle) result cache.

    Lookups are memoised in ``self.cache``; call ``_save_cache()`` to persist
    the cache to ``cache_file``. The actual lookup is delegated to
    ``_umls_lookup``, which concrete mappers must provide.
    """

    def __init__(self, cache_file="umls_cache.pkl"):
        """Load any previously saved cache from *cache_file*."""
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self):
        """Return the cache stored in ``cache_file``, or an empty dict.

        A missing, empty or unreadable cache file is not an error: the cache
        is disposable, so the mapper simply starts with no entries.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only point ``cache_file`` at files this application wrote itself.
        try:
            with open(self.cache_file, 'rb') as f:
                return pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            return {}

    def _save_cache(self):
        """Write the current cache to ``cache_file``, replacing its content."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump(self.cache, f)

    def _umls_lookup(self, term):
        """Perform the uncached lookup for *term*; must be overridden."""
        raise NotImplementedError(
            "CachedUMLSMapper._umls_lookup must be implemented by a subclass"
        )

    def map_term(self, term):
        """Map term to UMLS with caching.

        ``self.cache`` is the single source of truth: there is deliberately
        no ``lru_cache`` layer on this method, because it would keep every
        instance alive and would return stale results after the dict cache
        is cleared or reloaded.
        """
        try:
            return self.cache[term]
        except KeyError:
            pass
        concepts = self._umls_lookup(term)
        self.cache[term] = concepts
        return concepts
Batch Processing
def process_entities_batch(entities, batch_size=100, mapper=None):
    """Process entities in batches for better performance.

    Each distinct lower-cased entity text is looked up once for the whole
    call, even when it recurs in later batches. An entity whose lookup
    yields concepts gets the first one assigned to ``umls_concept``.

    Args:
        entities: Sequence of objects exposing a ``text`` attribute.
        batch_size: Number of entities handled per batch; must be positive.
        mapper: Object with a ``map_term(term)`` method. When omitted, the
            module-level ``umls_mapper`` is used.

    Returns:
        A list containing the same entity objects, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would otherwise silently produce no batches at all.
        raise ValueError("batch_size must be a positive integer")
    if mapper is None and entities:
        # Resolved lazily so an empty input never touches the global.
        mapper = umls_mapper

    term_concepts = {}
    results = []
    for start in range(0, len(entities), batch_size):
        batch = entities[start:start + batch_size]

        # Look up only the terms not resolved by an earlier batch.
        for term in {entity.text.lower() for entity in batch}:
            if term not in term_concepts:
                term_concepts[term] = mapper.map_term(term)

        # Assign concepts to entities
        for entity in batch:
            concepts = term_concepts.get(entity.text.lower(), [])
            if concepts:
                entity.umls_concept = concepts[0]

        results.extend(batch)
    return results
Custom UMLS Integration
Local UMLS Dictionary
class LocalUMLSMapper(BaseAnnotator):
    """Annotator that maps entities through a local JSON concept dictionary.

    The dictionary file is a JSON object keyed by CUI. Each value provides
    ``preferred_term``, ``semantic_types`` and an optional ``terms`` list of
    surface forms that should resolve to that CUI.
    """

    def __init__(self, dictionary_path):
        """Load the dictionary at *dictionary_path* into a lookup table."""
        super().__init__()
        self.concepts = self._load_dictionary(dictionary_path)

    def _load_dictionary(self, path):
        """Load local UMLS dictionary.

        Returns:
            A dict mapping each lower-cased term to a ``UMLSConcept`` with
            confidence ``1.0``. When several CUIs list the same term, the one
            appearing last in the file wins.
        """
        import json
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Build efficient lookup structure
        lookup = {}
        for cui, concept_data in data.items():
            for term in concept_data.get('terms', []):
                lookup[term.lower()] = UMLSConcept(
                    cui=cui,
                    preferred_term=concept_data['preferred_term'],
                    semantic_types=concept_data['semantic_types'],
                    confidence=1.0,
                )
        return lookup

    def process(self, doc: Document) -> Document:
        """Assign a concept to every entity whose lower-cased text is known."""
        for entity in doc.entities:
            concept = self.concepts.get(entity.text.lower())
            if concept:
                entity.umls_concept = concept
        return doc
REST API Integration
import requests
from functools import lru_cache
class UMLSRESTMapper(BaseAnnotator):
    """Annotator that maps entities to UMLS concepts through the UTS REST API.

    Authentication uses the ``apiKey`` query parameter on every request, as
    described in the UTS REST API documentation. No ticket is requested or
    cached, so there is no credential that can go stale between calls.
    """

    # Upper bound on the number of cached search terms per instance.
    _CACHE_LIMIT = 10000

    def __init__(self, api_key, timeout=10.0):
        """Prepare the HTTP session.

        Args:
            api_key: UTS API key used to authenticate every request.
            timeout: Seconds to wait for a response before giving up.
        """
        super().__init__()
        self.api_key = api_key
        self.base_url = "https://uts-ws.nlm.nih.gov/rest"
        self.session = requests.Session()
        self.timeout = timeout
        # Per-instance cache. ``lru_cache`` on a method would be shared by
        # all instances, keep them alive, and also remember failed requests.
        self._search_cache = {}

    def _search_concept(self, term):
        """Search for concept via REST API.

        Returns:
            A list of ``UMLSConcept`` objects (possibly empty). Only results
            of successful requests are cached; a non-200 response yields an
            empty list and is retried on the next call.
        """
        cached = self._search_cache.get(term)
        if cached is not None:
            # Hand out a copy so callers cannot alter the cached list.
            return list(cached)

        url = f"{self.base_url}/search/current"
        params = {
            "string": term,
            "apiKey": self.api_key,
            "pageSize": 5
        }
        response = self.session.get(url, params=params, timeout=self.timeout)
        if response.status_code != 200:
            return []

        data = response.json()
        concepts = [
            UMLSConcept(
                cui=result["ui"],
                preferred_term=result["name"],
                semantic_types=[],  # Would need additional API call
                confidence=1.0
            )
            for result in data.get("result", {}).get("results", [])
            # The service may signal "no results" with a placeholder entry
            # whose ui is "NONE"; that is not a concept.
            if result.get("ui") != "NONE"
        ]

        # Drop the oldest entry (dicts keep insertion order) when full.
        if len(self._search_cache) >= self._CACHE_LIMIT:
            self._search_cache.pop(next(iter(self._search_cache)))
        self._search_cache[term] = concepts
        return list(concepts)

    def process(self, doc: Document) -> Document:
        """Assign the first search hit, if any, to every entity of *doc*."""
        for entity in doc.entities:
            concepts = self._search_concept(entity.text)
            if concepts:
                entity.umls_concept = concepts[0]
        return doc
Evaluation and Quality Assurance
Concept Mapping Evaluation
def evaluate_umls_mapping(gold_standard, predictions):
    """Evaluate UMLS concept mapping quality.

    Gold items and predictions are compared pairwise by position. If the two
    sequences differ in length, the surplus items of the longer one are not
    compared; recall is still computed over all of ``gold_standard``.

    Args:
        gold_standard: Sequence of objects exposing ``expected_cui``.
        predictions: Sequence of objects exposing ``umls_concept`` (an object
            with a ``cui`` attribute, or a falsy value when unmapped).

    Returns:
        A dict with ``precision``, ``recall``, ``f1``, ``total_mapped`` (the
        number of compared predictions that carry a concept) and
        ``total_gold``. Any ratio with a zero denominator is reported as 0.
    """
    correct_cuis = 0
    total_predictions = 0
    for gold, pred in zip(gold_standard, predictions):
        if not pred.umls_concept:
            continue
        total_predictions += 1
        if pred.umls_concept.cui == gold.expected_cui:
            correct_cuis += 1

    total_gold = len(gold_standard)
    precision = correct_cuis / total_predictions if total_predictions > 0 else 0
    # Guard against an empty gold standard instead of dividing by zero.
    recall = correct_cuis / total_gold if total_gold > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "total_mapped": total_predictions,
        "total_gold": total_gold
    }
Concept Validation
def validate_umls_concepts(entities):
    """Validate UMLS concept assignments.

    Entities without a concept are skipped. For every other entity the CUI
    format (``C`` followed by exactly seven digits), the semantic type format
    (``T`` followed by exactly three digits) and the confidence range
    (0.0 to 1.0 inclusive) are checked.

    Args:
        entities: Iterable of objects exposing ``text`` and ``umls_concept``.

    Returns:
        A list with one dict per validated entity, containing ``entity``,
        ``cui``, ``cui_valid``, ``semtype_valid``, ``confidence_valid`` and
        ``overall_valid``.
    """
    # Imported here so the function works even when the surrounding module
    # does not import ``re`` itself.
    import re

    # fullmatch with explicit ASCII digit classes: rejects a trailing newline
    # and non-ASCII digits, both of which ``^...\d...$`` would accept.
    cui_pattern = re.compile(r'C[0-9]{7}')
    semtype_pattern = re.compile(r'T[0-9]{3}')

    validation_results = []
    for entity in entities:
        concept = entity.umls_concept
        if not concept:
            continue

        cui_valid = bool(cui_pattern.fullmatch(concept.cui))
        semtype_valid = all(
            bool(semtype_pattern.fullmatch(st))
            for st in concept.semantic_types
        )
        confidence_valid = 0.0 <= concept.confidence <= 1.0

        validation_results.append({
            "entity": entity.text,
            "cui": concept.cui,
            "cui_valid": cui_valid,
            "semtype_valid": semtype_valid,
            "confidence_valid": confidence_valid,
            "overall_valid": cui_valid and semtype_valid and confidence_valid
        })
    return validation_results
Best Practices
1. Choose Appropriate Vocabularies
- Medications: Use RxNorm
- Conditions: Use SNOMED CT
- Lab Tests: Use LOINC
- Procedures: Use CPT/SNOMED CT
2. Optimize Performance
- Enable caching for repeated lookups
- Use batch processing for multiple entities
- Filter by semantic types to reduce candidates
- Set appropriate similarity thresholds
3. Quality Assurance
- Validate CUI formats and semantic types
- Evaluate mapping accuracy on gold standard data
- Monitor confidence scores and adjust thresholds
- Review unmapped entities regularly
4. Handle Edge Cases
- Implement fallback for unmapped entities
- Handle abbreviations and synonyms
- Consider context for disambiguation
- Manage overlapping concept matches
5. Stay Updated
- Update UMLS releases regularly
- Monitor vocabulary changes
- Validate mappings after updates
- Document version dependencies