@source: 260313 heath-infer-step01

02-reusable-code-python/utils/fuzzy_match.py
reusable
python
"""
@source: 260313 heath-infer-step01
@extracted: 2026-03-14
@description: 한글 자모 분리 + Levenshtein 기반 퍼지 매칭 엔진.
              다중 레이어 매칭 (정확 -> 접두어 -> 부분문자열 -> 퍼지) 지원.

사용법:
    # 딕셔너리 기반 매칭
    matcher = FuzzyMatcher(
        dictionary={"두통": "T001", "편두통": "T002"},
        min_fuzzy_threshold=0.6,
    )
    result = matcher.match("두퉁")   # 한글 오타도 매칭
    results = matcher.search("두", limit=5)

    # 단순 유사도 계산
    score = hangul_levenshtein("두통", "두퉁")  # 0.85+

    # 오타 교정
    corrected = matcher.correct_typo("두퉁", threshold=0.7)  # "두통"
"""
from dataclasses import dataclass
from enum import Enum
from typing import Callable


# ============================================================
# 한글 유니코드 상수
# ============================================================

# 한글 유니코드 범위
HANGUL_BASE = 0xAC00  # '가'
HANGUL_END = 0xD7A3   # '힣'

# 초성 (19개)
CHOSEONG = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
    'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]

# 중성 (21개)
JUNGSEONG = [
    'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ',
    'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
]

# 종성 (28개, 첫 번째는 종성 없음)
JONGSEONG = [
    '', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ',
    'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ',
    'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
]


# ============================================================
# 한글 자모 분리 함수
# ============================================================

def decompose_hangul(text: str) -> str:
    """
    한글 문자를 자모 단위로 분리

    예시:
        "두통" -> "ㄷㅜㅌㅗㅇ"
        "머리" -> "ㅁㅓㄹㅣ"

    Args:
        text: 입력 문자열

    Returns:
        자모 분리된 문자열
    """
    result = []

    for char in text:
        code = ord(char)

        # 한글 완성형 범위인 경우
        if HANGUL_BASE <= code <= HANGUL_END:
            # 유니코드 계산
            offset = code - HANGUL_BASE
            cho_idx = offset // (21 * 28)
            jung_idx = (offset % (21 * 28)) // 28
            jong_idx = offset % 28

            # 자모 추가
            result.append(CHOSEONG[cho_idx])
            result.append(JUNGSEONG[jung_idx])
            if jong_idx > 0:  # 종성이 있는 경우
                result.append(JONGSEONG[jong_idx])
        else:
            # 한글이 아닌 경우 그대로 추가
            result.append(char)

    return ''.join(result)


def is_hangul(char: str) -> bool:
    """한글 문자 여부 확인"""
    code = ord(char) if len(char) == 1 else 0
    return HANGUL_BASE <= code <= HANGUL_END


# ============================================================
# Levenshtein 거리 계산
# ============================================================

def levenshtein_distance(s1: str, s2: str) -> int:
    """
    두 문자열 간의 Levenshtein (편집) 거리 계산

    Args:
        s1: 첫 번째 문자열
        s2: 두 번째 문자열

    Returns:
        편집 거리 (삽입, 삭제, 치환 횟수)
    """
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    # 이전 행과 현재 행만 유지 (메모리 최적화)
    previous_row = list(range(len(s2) + 1))

    for i, c1 in enumerate(s1):
        current_row = [i + 1]

        for j, c2 in enumerate(s2):
            # 삽입, 삭제, 치환 중 최소 비용
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)

            current_row.append(min(insertions, deletions, substitutions))

        previous_row = current_row

    return previous_row[-1]


def normalized_levenshtein(s1: str, s2: str) -> float:
    """
    정규화된 Levenshtein 유사도 (0.0 ~ 1.0)

    1.0 = 완전 일치, 0.0 = 완전 불일치
    """
    if not s1 and not s2:
        return 1.0

    max_len = max(len(s1), len(s2))
    if max_len == 0:
        return 1.0

    distance = levenshtein_distance(s1, s2)
    return 1.0 - (distance / max_len)


def hangul_levenshtein(s1: str, s2: str) -> float:
    """
    한글 자모 분리 기반 Levenshtein 유사도

    일반 문자열 비교보다 한글 오타에 강함
    예: "두통" vs "두퉁" -> 자모 비교로 높은 유사도
    """
    decomposed1 = decompose_hangul(s1)
    decomposed2 = decompose_hangul(s2)

    return normalized_levenshtein(decomposed1, decomposed2)


# ============================================================
# 매칭 타입 및 결과
# ============================================================

class MatchType(Enum):
    """매칭 유형"""
    EXACT = "exact"           # 정확 일치
    PREFIX = "prefix"         # 접두어 일치
    SUBSTRING = "substring"   # 부분 문자열 일치
    FUZZY = "fuzzy"           # 퍼지 매칭
    NONE = "none"             # 매칭 실패


@dataclass
class MatchResult:
    """매칭 결과"""
    term: str                 # 매칭된 용어
    matched_id: str           # 매칭된 ID (딕셔너리 value)
    score: float              # 유사도 점수 (0.0 ~ 1.0)
    match_type: MatchType     # 매칭 유형

    @property
    def confidence(self) -> float:
        """신뢰도 점수 (match_type 가중치 적용)"""
        type_weights = {
            MatchType.EXACT: 1.0,
            MatchType.PREFIX: 0.95,
            MatchType.SUBSTRING: 0.85,
            MatchType.FUZZY: 0.75,
            MatchType.NONE: 0.0,
        }
        return self.score * type_weights.get(self.match_type, 0.5)


# ============================================================
# 퍼지 매처 클래스
# ============================================================

class FuzzyMatcher:
    """
    다중 레이어 퍼지 매칭 엔진

    매칭 순서:
    1. 정확 매칭 (exact)
    2. 접두어 매칭 (prefix)
    3. 부분 문자열 매칭 (substring)
    4. 퍼지 매칭 (fuzzy - Levenshtein)
    """

    def __init__(
        self,
        dictionary: dict[str, str],
        min_fuzzy_threshold: float = 0.6,
        use_jamo: bool = True,
    ):
        """
        Args:
            dictionary: 용어 -> ID 매핑 딕셔너리
            min_fuzzy_threshold: 퍼지 매칭 최소 임계값 (0.0 ~ 1.0)
            use_jamo: 한글 자모 분리 사용 여부
        """
        self.dictionary = dictionary
        self.min_fuzzy_threshold = min_fuzzy_threshold
        self.use_jamo = use_jamo

        # 소문자 변환된 인덱스 생성 (빠른 검색용)
        self._lower_index: dict[str, tuple[str, str]] = {}
        for term, term_id in dictionary.items():
            self._lower_index[term.lower()] = (term, term_id)

    def match(self, query: str) -> MatchResult | None:
        """
        단일 쿼리에 대한 최적 매칭 수행

        Args:
            query: 검색어

        Returns:
            MatchResult 또는 None (매칭 실패)
        """
        if not query or not query.strip():
            return None

        query = query.strip()
        query_lower = query.lower()

        # 1. 정확 매칭
        if query_lower in self._lower_index:
            term, term_id = self._lower_index[query_lower]
            return MatchResult(
                term=term,
                matched_id=term_id,
                score=1.0,
                match_type=MatchType.EXACT,
            )

        # 2. 접두어 매칭
        prefix_matches = []
        for term_lower, (term, term_id) in self._lower_index.items():
            if term_lower.startswith(query_lower):
                score = len(query_lower) / len(term_lower)
                prefix_matches.append(MatchResult(
                    term=term,
                    matched_id=term_id,
                    score=score,
                    match_type=MatchType.PREFIX,
                ))

        if prefix_matches:
            return max(prefix_matches, key=lambda x: x.score)

        # 3. 부분 문자열 매칭
        substring_matches = []
        for term_lower, (term, term_id) in self._lower_index.items():
            if query_lower in term_lower:
                score = len(query_lower) / len(term_lower)
                substring_matches.append(MatchResult(
                    term=term,
                    matched_id=term_id,
                    score=score,
                    match_type=MatchType.SUBSTRING,
                ))

        if substring_matches:
            return max(substring_matches, key=lambda x: x.score)

        # 4. 퍼지 매칭
        return self._fuzzy_match(query)

    def _fuzzy_match(self, query: str) -> MatchResult | None:
        """퍼지 매칭 수행"""
        best_match: MatchResult | None = None
        best_score = 0.0

        query_lower = query.lower()

        for term_lower, (term, term_id) in self._lower_index.items():
            # 한글 자모 기반 유사도
            if self.use_jamo:
                score = hangul_levenshtein(query_lower, term_lower)
            else:
                score = normalized_levenshtein(query_lower, term_lower)

            if score >= self.min_fuzzy_threshold and score > best_score:
                best_score = score
                best_match = MatchResult(
                    term=term,
                    matched_id=term_id,
                    score=score,
                    match_type=MatchType.FUZZY,
                )

        return best_match

    def search(
        self,
        query: str,
        limit: int = 10,
        min_score: float = 0.0,
    ) -> list[MatchResult]:
        """
        검색 쿼리에 대해 유사도 순으로 정렬된 결과 반환

        Args:
            query: 검색어
            limit: 최대 반환 개수
            min_score: 최소 점수 임계값

        Returns:
            유사도 순으로 정렬된 MatchResult 리스트
        """
        if not query or not query.strip():
            return []

        query = query.strip()
        query_lower = query.lower()

        results: list[MatchResult] = []
        seen_ids: set[str] = set()

        for term_lower, (term, term_id) in self._lower_index.items():
            # 중복 ID 제외
            if term_id in seen_ids:
                continue

            # 매칭 유형 및 점수 결정
            match_type, score = self._calculate_match_score(
                query_lower, term_lower
            )

            if score >= min_score:
                results.append(MatchResult(
                    term=term,
                    matched_id=term_id,
                    score=score,
                    match_type=match_type,
                ))
                seen_ids.add(term_id)

        # 신뢰도(confidence) 기준 정렬
        results.sort(key=lambda x: x.confidence, reverse=True)

        return results[:limit]

    def _calculate_match_score(
        self,
        query: str,
        term: str,
    ) -> tuple[MatchType, float]:
        """쿼리와 용어 간의 매칭 유형 및 점수 계산"""
        # 정확 매칭
        if query == term:
            return MatchType.EXACT, 1.0

        # 접두어 매칭
        if term.startswith(query):
            score = len(query) / len(term)
            return MatchType.PREFIX, score

        # 부분 문자열 매칭
        if query in term:
            score = len(query) / len(term)
            return MatchType.SUBSTRING, score

        # 퍼지 매칭
        if self.use_jamo:
            score = hangul_levenshtein(query, term)
        else:
            score = normalized_levenshtein(query, term)

        if score >= self.min_fuzzy_threshold:
            return MatchType.FUZZY, score

        return MatchType.NONE, 0.0

    def correct_typo(
        self,
        query: str,
        threshold: float = 0.7,
    ) -> str | None:
        """
        오타 교정

        Args:
            query: 입력 문자열
            threshold: 최소 유사도 임계값

        Returns:
            교정된 용어 또는 None
        """
        result = self.match(query)

        if result and result.score >= threshold:
            return result.term

        return None


# ============================================================
# 편의 함수
# ============================================================

def create_matcher(
    dictionary: dict[str, str],
    min_threshold: float = 0.6,
    use_jamo: bool = True,
) -> FuzzyMatcher:
    """
    FuzzyMatcher 팩토리 함수

    Args:
        dictionary: 용어 -> ID 매핑 딕셔너리
        min_threshold: 최소 퍼지 매칭 임계값
        use_jamo: 한글 자모 분리 사용 여부

    Returns:
        FuzzyMatcher 인스턴스
    """
    return FuzzyMatcher(
        dictionary=dictionary,
        min_fuzzy_threshold=min_threshold,
        use_jamo=use_jamo,
    )


def score_match(
    query: str,
    candidate: str,
    use_jamo: bool = True,
) -> tuple[MatchType, float]:
    """
    단일 쿼리-후보 쌍의 매칭 점수 계산

    Args:
        query: 검색어
        candidate: 후보 용어
        use_jamo: 한글 자모 분리 사용 여부

    Returns:
        (매칭 유형, 점수)
    """
    query_lower = query.lower()
    candidate_lower = candidate.lower()

    # 정확 매칭
    if query_lower == candidate_lower:
        return MatchType.EXACT, 1.0

    # 접두어 매칭
    if candidate_lower.startswith(query_lower):
        score = len(query_lower) / len(candidate_lower)
        return MatchType.PREFIX, score

    # 부분 문자열 매칭
    if query_lower in candidate_lower:
        score = len(query_lower) / len(candidate_lower)
        return MatchType.SUBSTRING, score

    # 퍼지 매칭
    if use_jamo:
        score = hangul_levenshtein(query_lower, candidate_lower)
    else:
        score = normalized_levenshtein(query_lower, candidate_lower)

    if score >= 0.5:  # 최소 임계값
        return MatchType.FUZZY, score

    return MatchType.NONE, 0.0