@source: 260313 heath-infer-step01

02-reusable-code-python/utils/expression_matcher.py
reusable
python
"""
@source: 260313 heath-infer-step01
@extracted: 2026-03-14
@description: 의성어/의태어, 은유 표현, 정도 부사 분석기.
              도메인 키워드 사전을 주입하여 텍스트에서 패턴 매칭 + 강도 분석 수행.

사용법:
    # 사전 데이터 (JSON 또는 딕셔너리)
    onomatopoeia = {
        "쿵쿵": {"category": "throbbing", "intensity": 0.7, "description": "박동성 통증"},
        "찌릿찌릿": {"category": "electric", "intensity": 0.8, "description": "전기 느낌"},
    }
    intensity_adverbs = {
        "매우": {"score": 0.9, "category": "severe"},
        "조금": {"score": 0.3, "category": "mild"},
    }

    matcher = ExpressionMatcher(
        onomatopoeia=onomatopoeia,
        intensity_adverbs=intensity_adverbs,
    )
    matches = matcher.match_expression("쿵쿵거리는 통증")
    intensity = matcher.analyze_intensity("매우 심한 통증")
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any


class ExpressionType(str, Enum):
    """Kind of expression a match was produced from.

    Mixes in ``str``, so each member compares equal to its plain string value.
    """
    ONOMATOPOEIA = "onomatopoeia"  # onomatopoeic / mimetic word
    METAPHOR = "metaphor"          # metaphorical expression
    INTENSITY = "intensity"        # degree adverb
    DIRECT = "direct"              # direct (literal) expression


class UrgencyLevel(str, Enum):
    """Urgency attached to a match.

    Mixes in ``str``, so a member can be looked up from its lowercase string
    value, e.g. ``UrgencyLevel("high")``.
    """
    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


@dataclass
class ExpressionMatch:
    """Result of matching one dictionary expression in a text."""
    matched_text: str              # the dictionary term / pattern that matched
    expression_type: ExpressionType  # which kind of dictionary produced the match
    category: str                  # domain-specific category of the expression
    candidate_ids: list[str]       # candidate IDs associated with the expression
    intensity: float               # intensity (0.0 ~ 1.0)
    urgency: UrgencyLevel          # urgency level
    confidence: float              # confidence assigned by the matcher
    description: str               # human-readable description
    note: str | None = None        # optional extra note


@dataclass
class IntensityAnalysis:
    """Result of analysing the degree adverbs in a text."""
    detected_adverb: str | None    # adverb (or synonym) found in the text, if any
    score: float                   # intensity score (0.0 ~ 1.0)
    category: str                  # category of the detected adverb (severe, moderate, ...)
    is_negation: bool              # whether a negation expression was found
    is_progressive: bool           # whether a progressive ("more and more") expression was found


class ExpressionMatcher:
    """
    확장 표현 매처

    의성어/의태어, 은유 표현, 정도 부사를 분석하여
    텍스트 검색 품질을 향상시키는 범용 모듈.
    도메인별 사전을 주입하여 사용합니다.
    """

    def __init__(
        self,
        onomatopoeia: dict[str, Any] | None = None,
        metaphors: dict[str, Any] | None = None,
        intensity_adverbs: dict[str, Any] | None = None,
        category_to_ids: dict[str, list[str]] | None = None,
    ):
        """
        Args:
            onomatopoeia: 의성어/의태어 사전.
                키: 표현 문자열, 값: {category, intensity, description, candidate_ids, ...}
                또는 {"onomatopoeia": {...}} 형태도 지원
            metaphors: 은유 표현 사전.
                {"metaphor_categories": {카테고리: {description, patterns: [{pattern, intensity, urgency, category}]}},
                 "alerts": [{trigger: [...], alert, urgency}]}
            intensity_adverbs: 정도 부사 사전.
                키: 부사, 값: {score, category, synonyms?}
                또는 {"intensity_adverbs": {...}} 형태도 지원
            category_to_ids: 카테고리 -> ID 목록 매핑 (기본: 빈 딕셔너리)
        """
        # 카테고리 → ID 매핑
        self.category_to_ids = category_to_ids or {}

        # 의성어/의태어 데이터
        self.onomatopoeia = {}
        if onomatopoeia:
            self.onomatopoeia = onomatopoeia.get("onomatopoeia", onomatopoeia)

        # 은유 표현 데이터
        self.metaphor_categories = {}
        self.alerts = []
        if metaphors:
            self.metaphor_categories = metaphors.get("metaphor_categories", {})
            self.alerts = metaphors.get("alerts", [])

        # 정도 부사 데이터
        self.intensity_adverbs = {}
        if intensity_adverbs:
            self.intensity_adverbs = intensity_adverbs.get("intensity_adverbs", intensity_adverbs)

        # 정렬된 패턴 (긴 것 우선)
        self._sorted_onomatopoeia = sorted(
            self.onomatopoeia.items(),
            key=lambda x: len(x[0]),
            reverse=True
        )

        # 은유 패턴 정규화
        self._metaphor_patterns: list[tuple[str, dict[str, Any], str]] = []
        for category, info in self.metaphor_categories.items():
            for pattern_info in info.get("patterns", []):
                self._metaphor_patterns.append((
                    pattern_info["pattern"],
                    pattern_info,
                    category
                ))
        # 긴 패턴 우선 정렬
        self._metaphor_patterns.sort(key=lambda x: len(x[0]), reverse=True)

    def match_expression(self, text: str) -> list[ExpressionMatch]:
        """
        텍스트에서 모든 표현 매칭

        Args:
            text: 입력 텍스트

        Returns:
            매칭된 표현 목록
        """
        results: list[ExpressionMatch] = []
        text_lower = text.lower()

        # 1. 의성어/의태어 매칭
        ono_matches = self._match_onomatopoeia(text_lower)
        results.extend(ono_matches)

        # 2. 은유 표현 매칭
        metaphor_matches = self._match_metaphors(text_lower)
        results.extend(metaphor_matches)

        return results

    def _match_onomatopoeia(self, text: str) -> list[ExpressionMatch]:
        """의성어/의태어 매칭"""
        results: list[ExpressionMatch] = []
        matched_positions: set[tuple[int, int]] = set()

        for ono_term, info in self._sorted_onomatopoeia:
            start = 0
            while True:
                pos = text.find(ono_term, start)
                if pos == -1:
                    break

                end_pos = pos + len(ono_term)

                # 이미 매칭된 위치와 겹치는지 확인
                overlaps = any(
                    not (end_pos <= s or pos >= e)
                    for s, e in matched_positions
                )

                if not overlaps:
                    matched_positions.add((pos, end_pos))

                    intensity = info.get("intensity", 0.5)
                    if intensity >= 0.9:
                        urgency = UrgencyLevel.HIGH
                    elif intensity >= 0.7:
                        urgency = UrgencyLevel.MEDIUM
                    else:
                        urgency = UrgencyLevel.LOW

                    # 카테고리에서 ID 후보 조회
                    cat = info.get("category", "unknown")
                    candidate_ids = info.get("candidate_ids", self.category_to_ids.get(cat, []))

                    results.append(ExpressionMatch(
                        matched_text=ono_term,
                        expression_type=ExpressionType.ONOMATOPOEIA,
                        category=cat,
                        candidate_ids=candidate_ids,
                        intensity=intensity,
                        urgency=urgency,
                        confidence=0.85,
                        description=info.get("description", ""),
                    ))

                start = end_pos

        return results

    def _match_metaphors(self, text: str) -> list[ExpressionMatch]:
        """은유 표현 매칭"""
        results: list[ExpressionMatch] = []
        matched_positions: set[tuple[int, int]] = set()

        for pattern, info, category in self._metaphor_patterns:
            start = 0
            while True:
                pos = text.find(pattern, start)
                if pos == -1:
                    break

                end_pos = pos + len(pattern)

                overlaps = any(
                    not (end_pos <= s or pos >= e)
                    for s, e in matched_positions
                )

                if not overlaps:
                    matched_positions.add((pos, end_pos))

                    urgency_str = info.get("urgency", "medium")
                    try:
                        urgency = UrgencyLevel(urgency_str)
                    except ValueError:
                        urgency = UrgencyLevel.MEDIUM

                    category_info = self.metaphor_categories.get(category, {})

                    cat = info.get("category", category)
                    candidate_ids = info.get("candidate_ids", self.category_to_ids.get(cat, []))

                    results.append(ExpressionMatch(
                        matched_text=pattern,
                        expression_type=ExpressionType.METAPHOR,
                        category=cat,
                        candidate_ids=candidate_ids,
                        intensity=info.get("intensity", 0.5),
                        urgency=urgency,
                        confidence=0.8,
                        description=category_info.get("description", ""),
                        note=category_info.get("note"),
                    ))

                start = end_pos

        return results

    def analyze_intensity(self, text: str) -> IntensityAnalysis:
        """
        Derive an intensity score and negation/progressive flags from ``text``.

        Every entry of ``self.intensity_adverbs`` is tested by case-insensitive
        substring search, trying the headword first and then its ``synonyms``
        in listed order. Empty headwords/synonyms are ignored. What a hit
        means depends on the entry's ``category``:

        * ``negation`` / ``complete_negation`` / ``mild_negation`` set
          ``is_negation``;
        * ``progressive`` sets ``is_progressive``;
        * anything else competes for the main result, where the highest
          ``score`` wins and the earliest entry wins a tie.

        Args:
            text: Input text.

        Returns:
            IntensityAnalysis. When no main adverb is found,
            ``detected_adverb`` is ``None``, ``score`` is 0.5 and ``category``
            is ``"moderate"``.
        """
        text_lower = text.lower()

        detected_adverb: str | None = None
        detected_score: float | None = None
        category = "moderate"
        is_negation = False
        is_progressive = False

        for adverb, info in self.intensity_adverbs.items():
            # Headword first, then synonyms; "synonyms" may be absent or None.
            surface_forms = [adverb, *(info.get("synonyms") or [])]
            # The text was lower-cased, so the dictionary side must be too.
            # Empty forms are skipped because "" is a substring of any text.
            hit = next(
                (
                    form
                    for form in surface_forms
                    if form and form.lower() in text_lower
                ),
                None,
            )
            if hit is None:
                continue

            adv_category = info.get("category", "moderate")
            if adv_category in ("negation", "complete_negation", "mild_negation"):
                is_negation = True
            elif adv_category == "progressive":
                is_progressive = True
            else:
                score = info.get("score", 0.5)
                # Strict ">" keeps the earliest entry on equal scores.
                if detected_score is None or score > detected_score:
                    detected_score = score
                    detected_adverb = hit
                    category = adv_category

        final_score = detected_score if detected_score is not None else 0.5

        return IntensityAnalysis(
            detected_adverb=detected_adverb,
            score=final_score,
            category=category,
            is_negation=is_negation,
            is_progressive=is_progressive,
        )

    def check_alerts(self, text: str) -> list[dict[str, Any]]:
        """
        도메인 경고 확인 (사전에 등록된 트리거 패턴 매칭)

        Args:
            text: 입력 텍스트

        Returns:
            트리거된 경고 목록
        """
        triggered: list[dict[str, Any]] = []
        text_lower = text.lower()

        for alert in self.alerts:
            triggers = alert.get("trigger", [])

            # 모든 트리거가 텍스트에 있는지 확인
            if all(trigger in text_lower for trigger in triggers):
                triggered.append({
                    "trigger": triggers,
                    "alert": alert.get("alert", ""),
                    "urgency": alert.get("urgency", "medium"),
                })

        return triggered

    def get_ids_from_expression(self, text: str) -> list[tuple[str, float, str]]:
        """
        표현에서 후보 ID 추출

        Args:
            text: 입력 텍스트

        Returns:
            (id, confidence, source) 튜플 목록
        """
        results: list[tuple[str, float, str]] = []
        seen_ids: set[str] = set()

        matches = self.match_expression(text)

        for match in matches:
            for cid in match.candidate_ids:
                if cid not in seen_ids:
                    seen_ids.add(cid)
                    results.append((
                        cid,
                        match.confidence * match.intensity,
                        match.expression_type.value
                    ))

        return results


def create_expression_matcher(
    onomatopoeia: dict[str, Any] | None = None,
    metaphors: dict[str, Any] | None = None,
    intensity_adverbs: dict[str, Any] | None = None,
    category_to_ids: dict[str, list[str]] | None = None,
) -> ExpressionMatcher:
    """
    Factory function for ExpressionMatcher.

    Forwards all four arguments unchanged to the ``ExpressionMatcher``
    constructor.

    Args:
        onomatopoeia: Onomatopoeia dictionary.
        metaphors: Metaphor dictionary.
        intensity_adverbs: Degree-adverb dictionary.
        category_to_ids: Mapping from category to a list of IDs.

    Returns:
        A new ExpressionMatcher instance.
    """
    matcher_kwargs = {
        "onomatopoeia": onomatopoeia,
        "metaphors": metaphors,
        "intensity_adverbs": intensity_adverbs,
        "category_to_ids": category_to_ids,
    }
    return ExpressionMatcher(**matcher_kwargs)