# 02-reusable-code-python/utils/expression_matcher.py
"""
@source: 260313 heath-infer-step01
@extracted: 2026-03-14
@description: Analyzer for onomatopoeia/mimetic words, metaphorical expressions,
    and degree adverbs. Domain keyword dictionaries are injected by the caller;
    the matcher then performs pattern matching and intensity analysis on text.

Usage:
    # Dictionary data (loaded from JSON or built as plain dicts).
    # The matcher reads the "category", "intensity", "description" and
    # optional "candidate_ids" keys of each onomatopoeia entry.
    onomatopoeia = {
        "쿵쿵": {"category": "throbbing", "intensity": 0.7, "description": "박동성 통증"},
        "찌릿찌릿": {"category": "electric", "intensity": 0.8, "description": "전기 느낌"},
    }
    intensity_adverbs = {
        "매우": {"score": 0.9, "category": "severe"},
        "조금": {"score": 0.3, "category": "mild"},
    }
    matcher = ExpressionMatcher(
        onomatopoeia=onomatopoeia,
        intensity_adverbs=intensity_adverbs,
    )
    matches = matcher.match_expression("쿵쿵거리는 통증")
    intensity = matcher.analyze_intensity("매우 심한 통증")
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any
class ExpressionType(str, Enum):
    """Kind of expression a match was produced from.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # Onomatopoeia / mimetic words.
    ONOMATOPOEIA = "onomatopoeia"
    # Metaphorical expressions.
    METAPHOR = "metaphor"
    # Degree (intensity) adverbs.
    INTENSITY = "intensity"
    # Direct, literal expressions.
    DIRECT = "direct"
class UrgencyLevel(str, Enum):
    """Urgency attached to a match, declared from most to least urgent.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
@dataclass
class ExpressionMatch:
    """Result of matching one expression in a text.

    Attributes:
        matched_text: The text that was matched.
        expression_type: Kind of expression that produced the match.
        category: Domain-specific category of the expression.
        candidate_ids: Candidate IDs associated with the match.
        intensity: Intensity, from 0.0 to 1.0.
        urgency: Urgency level of the match.
        confidence: Confidence of the match.
        description: Human-readable description.
        note: Optional extra note; defaults to ``None``.
    """

    matched_text: str
    expression_type: ExpressionType
    category: str
    candidate_ids: list[str]
    intensity: float
    urgency: UrgencyLevel
    confidence: float
    description: str
    note: str | None = None
@dataclass
class IntensityAnalysis:
"""강도 분석 결과"""
detected_adverb: str | None # 감지된 부사
score: float # 강도 점수 (0.0 ~ 1.0)
category: str # 카테고리 (severe, moderate 등)
is_negation: bool # 부정 표현 여부
is_progressive: bool # 진행형 여부 (점점 더)
class ExpressionMatcher:
    """Dictionary-driven matcher for expressive language in free text.

    Detects onomatopoeia/mimetic words and metaphorical expressions by
    substring search, scores degree adverbs, and evaluates alert triggers.
    Every dictionary is injected by the caller, so the class itself carries
    no domain knowledge.

    Matching is case-insensitive: the input text and the dictionary terms
    are both lowercased before they are compared.
    """

    # Adverb categories that flag negation instead of contributing a score.
    _NEGATION_CATEGORIES = ("negation", "complete_negation", "mild_negation")
    # Adverb category that flags a progressive ("more and more") expression.
    _PROGRESSIVE_CATEGORY = "progressive"

    def __init__(
        self,
        onomatopoeia: dict[str, Any] | None = None,
        metaphors: dict[str, Any] | None = None,
        intensity_adverbs: dict[str, Any] | None = None,
        category_to_ids: dict[str, list[str]] | None = None,
    ) -> None:
        """Store the dictionaries and pre-sort the search patterns.

        Args:
            onomatopoeia: Onomatopoeia dictionary. Keys are the expressions,
                values are dicts read for ``category``, ``intensity``,
                ``description`` and ``candidate_ids``. A wrapper of the form
                ``{"onomatopoeia": {...}}`` is also accepted.
            metaphors: Metaphor dictionary of the form
                ``{"metaphor_categories": {name: {"description": ...,
                "patterns": [{"pattern", "intensity", "urgency",
                "category"}]}}, "alerts": [{"trigger": [...], "alert",
                "urgency"}]}``.
            intensity_adverbs: Degree-adverb dictionary. Keys are adverbs,
                values are dicts read for ``score``, ``category`` and
                optional ``synonyms``. A wrapper of the form
                ``{"intensity_adverbs": {...}}`` is also accepted.
            category_to_ids: Mapping from category to a list of IDs, used
                when an entry has no ``candidate_ids`` of its own. Defaults
                to an empty mapping.

        Raises:
            KeyError: If a metaphor pattern entry has no ``"pattern"`` key.
        """
        self.category_to_ids = category_to_ids or {}

        self.onomatopoeia: dict[str, Any] = {}
        if onomatopoeia:
            self.onomatopoeia = onomatopoeia.get("onomatopoeia", onomatopoeia)

        self.metaphor_categories: dict[str, Any] = {}
        self.alerts: list[dict[str, Any]] = []
        if metaphors:
            self.metaphor_categories = metaphors.get("metaphor_categories", {})
            self.alerts = metaphors.get("alerts", [])

        self.intensity_adverbs: dict[str, Any] = {}
        if intensity_adverbs:
            self.intensity_adverbs = intensity_adverbs.get(
                "intensity_adverbs", intensity_adverbs
            )

        # Longest terms first, so a long expression claims its span before a
        # shorter one contained in it. Empty terms are dropped because an
        # empty needle matches at every position and would never terminate.
        self._sorted_onomatopoeia: list[tuple[str, dict[str, Any]]] = sorted(
            ((term, info) for term, info in self.onomatopoeia.items() if term),
            key=lambda item: len(item[0]),
            reverse=True,
        )

        # Flatten the per-category pattern lists into (pattern, info, category).
        self._metaphor_patterns: list[tuple[str, dict[str, Any], str]] = []
        for category, info in self.metaphor_categories.items():
            for pattern_info in info.get("patterns", []):
                pattern = pattern_info["pattern"]
                if pattern:  # skip empty patterns, same reason as above
                    self._metaphor_patterns.append((pattern, pattern_info, category))
        self._metaphor_patterns.sort(key=lambda item: len(item[0]), reverse=True)

    @staticmethod
    def _scan_free_spans(text: str, needle: str, claimed: set[tuple[int, int]]):
        """Yield each occurrence of ``needle`` that overlaps no claimed span.

        Every yielded ``(start, end)`` span is added to ``claimed`` before it
        is yielded. An empty ``needle`` yields nothing.
        """
        if not needle:
            return
        width = len(needle)
        cursor = 0
        while True:
            pos = text.find(needle, cursor)
            if pos == -1:
                return
            end = pos + width
            if any(pos < c_end and c_start < end for c_start, c_end in claimed):
                # Step one character only: a later occurrence that starts
                # inside this rejected one may still be free.
                cursor = pos + 1
                continue
            claimed.add((pos, end))
            yield pos, end
            cursor = end

    def _candidate_ids_for(self, info: dict[str, Any], category: str) -> list[str]:
        """Return a fresh list of candidate IDs for a dictionary entry.

        Uses the entry's own ``candidate_ids`` when present, otherwise the
        ``category_to_ids`` mapping. The list is copied so that callers
        mutating a match cannot alter the shared dictionaries.
        """
        ids = info.get("candidate_ids")
        if ids is None:
            ids = self.category_to_ids.get(category, [])
        return list(ids)

    def match_expression(self, text: str) -> list[ExpressionMatch]:
        """Find every onomatopoeia and metaphor match in ``text``.

        Args:
            text: Input text.

        Returns:
            Onomatopoeia matches followed by metaphor matches.
        """
        lowered = text.lower()
        results: list[ExpressionMatch] = []
        results.extend(self._match_onomatopoeia(lowered))
        results.extend(self._match_metaphors(lowered))
        return results

    def _match_onomatopoeia(self, text: str) -> list[ExpressionMatch]:
        """Match onomatopoeia terms against already-lowercased ``text``.

        Urgency is derived from the entry's intensity: HIGH at 0.9 and above,
        MEDIUM at 0.7 and above, LOW otherwise.
        """
        results: list[ExpressionMatch] = []
        claimed: set[tuple[int, int]] = set()

        for term, info in self._sorted_onomatopoeia:
            category = info.get("category", "unknown")
            for _span in self._scan_free_spans(text, term.lower(), claimed):
                intensity = info.get("intensity", 0.5)
                if intensity >= 0.9:
                    urgency = UrgencyLevel.HIGH
                elif intensity >= 0.7:
                    urgency = UrgencyLevel.MEDIUM
                else:
                    urgency = UrgencyLevel.LOW

                results.append(ExpressionMatch(
                    matched_text=term,
                    expression_type=ExpressionType.ONOMATOPOEIA,
                    category=category,
                    candidate_ids=self._candidate_ids_for(info, category),
                    intensity=intensity,
                    urgency=urgency,
                    confidence=0.85,
                    description=info.get("description", ""),
                ))
        return results

    def _match_metaphors(self, text: str) -> list[ExpressionMatch]:
        """Match metaphor patterns against already-lowercased ``text``.

        An unrecognised ``urgency`` string falls back to MEDIUM. The
        description and note come from the enclosing metaphor category.
        """
        results: list[ExpressionMatch] = []
        claimed: set[tuple[int, int]] = set()

        for pattern, info, owner_category in self._metaphor_patterns:
            category_info = self.metaphor_categories.get(owner_category, {})
            # A pattern may override the category it is filed under.
            category = info.get("category", owner_category)
            for _span in self._scan_free_spans(text, pattern.lower(), claimed):
                try:
                    urgency = UrgencyLevel(info.get("urgency", "medium"))
                except ValueError:
                    urgency = UrgencyLevel.MEDIUM

                results.append(ExpressionMatch(
                    matched_text=pattern,
                    expression_type=ExpressionType.METAPHOR,
                    category=category,
                    candidate_ids=self._candidate_ids_for(info, category),
                    intensity=info.get("intensity", 0.5),
                    urgency=urgency,
                    confidence=0.8,
                    description=category_info.get("description", ""),
                    note=category_info.get("note"),
                ))
        return results

    def analyze_intensity(self, text: str) -> IntensityAnalysis:
        """Analyse the intensity expressed in ``text``.

        Adverbs in a negation or progressive category only set the matching
        flag; their synonyms are not consulted. For every other adverb, the
        adverb itself is checked first and then its synonyms; the
        highest-scoring hit wins, and the first one seen wins a tie.

        Args:
            text: Input text.

        Returns:
            The analysis. When no adverb is detected the score is 0.5 and
            the category is "moderate".
        """
        haystack = text.lower()
        best_adverb: str | None = None
        best_score: float | None = None
        category = "moderate"
        is_negation = False
        is_progressive = False

        for adverb, info in self.intensity_adverbs.items():
            score = info.get("score", 0.5)
            adv_category = info.get("category", "moderate")
            # An empty adverb is a substring of everything; never count it.
            adverb_found = bool(adverb) and adverb.lower() in haystack

            if adv_category in self._NEGATION_CATEGORIES:
                is_negation = is_negation or adverb_found
                continue
            if adv_category == self._PROGRESSIVE_CATEGORY:
                is_progressive = is_progressive or adverb_found
                continue

            if adverb_found:
                hit = adverb
            else:
                hit = next(
                    (
                        synonym
                        for synonym in info.get("synonyms", [])
                        if synonym and synonym.lower() in haystack
                    ),
                    None,
                )

            if hit is not None and (best_score is None or score > best_score):
                best_score = score
                best_adverb = hit
                category = adv_category

        return IntensityAnalysis(
            detected_adverb=best_adverb,
            score=best_score if best_score is not None else 0.5,
            category=category,
            is_negation=is_negation,
            is_progressive=is_progressive,
        )

    def check_alerts(self, text: str) -> list[dict[str, Any]]:
        """Return the alerts whose triggers all occur in ``text``.

        An alert fires only when it has at least one non-empty trigger and
        every such trigger is a substring of the lowercased text. A single
        string given as ``trigger`` is treated as a one-element list.

        Args:
            text: Input text.

        Returns:
            One dict per fired alert, with the keys "trigger", "alert" and
            "urgency".
        """
        haystack = text.lower()
        fired: list[dict[str, Any]] = []

        for alert in self.alerts:
            triggers = alert.get("trigger", [])
            if isinstance(triggers, str):
                triggers = [triggers]
            needles = [trigger.lower() for trigger in triggers if trigger]
            # all() is true for an empty iterable, so an alert without
            # triggers would otherwise fire on every text.
            if not needles:
                continue
            if all(needle in haystack for needle in needles):
                fired.append({
                    "trigger": triggers,
                    "alert": alert.get("alert", ""),
                    "urgency": alert.get("urgency", "medium"),
                })
        return fired

    def get_ids_from_expression(self, text: str) -> list[tuple[str, float, str]]:
        """Extract candidate IDs from the expressions found in ``text``.

        Each ID is reported once, taken from the first match that lists it.

        Args:
            text: Input text.

        Returns:
            ``(id, confidence, source)`` tuples, where ``confidence`` is the
            match confidence multiplied by its intensity and ``source`` is
            the expression type value.
        """
        results: list[tuple[str, float, str]] = []
        seen_ids: set[str] = set()

        for match in self.match_expression(text):
            for cid in match.candidate_ids:
                if cid in seen_ids:
                    continue
                seen_ids.add(cid)
                results.append((
                    cid,
                    match.confidence * match.intensity,
                    match.expression_type.value,
                ))
        return results
def create_expression_matcher(
    onomatopoeia: dict[str, Any] | None = None,
    metaphors: dict[str, Any] | None = None,
    intensity_adverbs: dict[str, Any] | None = None,
    category_to_ids: dict[str, list[str]] | None = None,
) -> ExpressionMatcher:
    """Factory function that builds an ``ExpressionMatcher``.

    Every argument is forwarded unchanged, by keyword, to the
    ``ExpressionMatcher`` constructor.

    Args:
        onomatopoeia: Onomatopoeia / mimetic-word dictionary.
        metaphors: Metaphorical-expression dictionary.
        intensity_adverbs: Degree-adverb dictionary.
        category_to_ids: Mapping from category to a list of IDs.

    Returns:
        The constructed ``ExpressionMatcher`` instance.
    """
    matcher_kwargs = {
        "onomatopoeia": onomatopoeia,
        "metaphors": metaphors,
        "intensity_adverbs": intensity_adverbs,
        "category_to_ids": category_to_ids,
    }
    return ExpressionMatcher(**matcher_kwargs)