뉴스 데이터 트렌드 추출 및 기회 점수 분석기.

02-reusable-code-python/news/analyzer.py
reusable
python
"""
뉴스 데이터 트렌드 추출 및 기회 점수 분석기.

수집된 뉴스 항목에서 키워드 빈도를 분석하여 트렌드 시그널을 도출하고,
트렌드 클러스터 기반으로 앱 개발 기회를 점수화한다.

@source: 00-general-pro
@extracted: 2026-03-08
@version: 1.0.0

의존성:
    - pydantic >= 2.0

사용법:
    ```python
    from news.analyzer import TrendAnalyzer
    from news.models import CollectionResult

    analyzer = TrendAnalyzer(
        min_frequency=2,
        top_trends=20,
        top_opportunities=5,
    )
    report = analyzer.analyze(collection_result)
    print(f"트렌드: {len(report.trends)}개")
    ```
"""

import logging
import re
import time
from collections import Counter
from datetime import datetime

from .models import (
    AnalysisReport,
    AppOpportunity,
    CollectionResult,
    NewsItem,
    TrendSignal,
)

logger = logging.getLogger(__name__)

# ============================================
# 상수
# ============================================

# English stopwords: lowercase tokens that are never counted as keywords.
# Besides common function words this includes "show", "hn" and "ask"
# (presumably to drop "Show HN" / "Ask HN" title prefixes -- confirm against
# the collectors in use).
_ENGLISH_STOPWORDS: set[str] = {
    "the", "a", "an", "is", "are", "was", "were",
    "in", "on", "at", "to", "for", "of", "with",
    "and", "or", "but", "not", "this", "that",
    "it", "i", "my", "your", "how", "what",
    "why", "when", "where", "show", "hn", "ask",
    "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "can", "shall",
    "its", "his", "her", "their", "our", "we",
    "they", "you", "he", "she", "me", "us",
    "them", "who", "which", "from", "by", "about",
    "up", "out", "if", "than", "so", "no", "just",
    "more", "also", "very", "too", "any", "each",
    "all", "some", "most", "other", "into", "over",
    "new", "now", "get", "got", "make", "made",
    "use", "using", "used", "one", "two", "via",
    "vs", "re", "way", "even", "still", "need",
    "like", "go", "going", "first", "after", "before",
}

# Korean stopwords.
# NOTE(review): _TOKEN_PATTERN only matches ASCII letters, digits and hyphens,
# so no Hangul token is ever extracted and these entries cannot currently
# match anything. They only become effective if the tokenizer is extended.
_KOREAN_STOPWORDS: set[str] = {
    "있는", "하는", "되는", "이는", "에서", "으로",
    "그리고", "하지만", "또는", "그러나", "위해",
    "대한", "통해", "것을", "것이", "것은", "있다",
}

# Union of both stopword sets; this is the set the keyword filter checks.
_ALL_STOPWORDS = _ENGLISH_STOPWORDS | _KOREAN_STOPWORDS

# Keyword -> category mapping.
# Keys are looked up with lowercased title tokens; keywords that are not
# listed here get an empty category on their TrendSignal.
# NOTE(review): "go" is also present in _ENGLISH_STOPWORDS, and stopwords are
# removed before keywords are counted, so the "go" entry below is never
# reached by TrendAnalyzer.
_KEYWORD_CATEGORIES: dict[str, str] = {
    # AI
    "ai": "ai", "llm": "ai", "gpt": "ai",
    "openai": "ai", "claude": "ai", "gemini": "ai",
    "copilot": "ai", "chatgpt": "ai", "ml": "ai",
    "machine": "ai", "learning": "ai", "neural": "ai",
    "model": "ai", "transformer": "ai", "diffusion": "ai",
    "rag": "ai", "agent": "ai", "agents": "ai",
    "fine-tuning": "ai", "inference": "ai",
    "deep": "ai", "training": "ai",
    # Web
    "react": "web", "nextjs": "web", "vue": "web",
    "svelte": "web", "angular": "web", "javascript": "web",
    "typescript": "web", "css": "web", "html": "web",
    "frontend": "web", "backend": "web", "fullstack": "web",
    "api": "web", "rest": "web", "graphql": "web",
    "wasm": "web", "webassembly": "web",
    # Mobile
    "mobile": "mobile", "ios": "mobile", "android": "mobile",
    "flutter": "mobile", "swift": "mobile", "kotlin": "mobile",
    "react-native": "mobile",
    # Developer tools
    "rust": "devtool", "go": "devtool", "python": "devtool",
    "docker": "devtool", "kubernetes": "devtool",
    "devtools": "devtool", "cli": "devtool",
    "editor": "devtool", "ide": "devtool",
    "vscode": "devtool", "git": "devtool",
    "terminal": "devtool", "compiler": "devtool",
    # SaaS
    "saas": "saas", "startup": "saas", "b2b": "saas",
    "productivity": "saas", "platform": "saas",
    "subscription": "saas", "pricing": "saas",
    # Fintech
    "fintech": "fintech", "crypto": "fintech",
    "blockchain": "fintech", "bitcoin": "fintech",
    "ethereum": "fintech", "payment": "fintech",
    "trading": "fintech", "defi": "fintech",
    # Infrastructure / cloud
    "cloud": "infra", "aws": "infra", "gcp": "infra",
    "azure": "infra", "serverless": "infra",
    "database": "infra", "postgres": "infra",
    "redis": "infra", "sqlite": "infra",
    # Security
    "security": "security", "privacy": "security",
    "encryption": "security", "auth": "security",
    "vulnerability": "security",
    # Open source
    "open-source": "opensource", "opensource": "opensource",
    "github": "opensource", "oss": "opensource",
}

# Default recommended tech stack per category.
# Keys are the category values used in _KEYWORD_CATEGORIES; TrendAnalyzer
# falls back to ["TypeScript", "Next.js"] for any category not listed here
# (e.g. the "general" cluster of uncategorized keywords).
_CATEGORY_STACKS: dict[str, list[str]] = {
    "ai": ["Python", "FastAPI", "LangChain", "Next.js"],
    "web": ["Next.js", "TypeScript", "Tailwind CSS"],
    "mobile": ["React Native", "TypeScript", "Expo"],
    "devtool": ["Rust", "TypeScript", "Electron"],
    "saas": [
        "Next.js", "TypeScript", "Supabase", "Stripe",
    ],
    "fintech": [
        "Next.js", "TypeScript", "Supabase", "Plaid",
    ],
    "infra": ["Go", "Docker", "Terraform"],
    "security": ["Rust", "TypeScript", "Next.js"],
    "opensource": ["TypeScript", "Rust", "GitHub Actions"],
}

# Tokenization regex: maximal runs of ASCII letters, digits and hyphens.
# Non-ASCII text (e.g. Hangul) is never matched, and a run consisting only of
# hyphens is matched like any other token.
_TOKEN_PATTERN = re.compile(r"[a-zA-Z0-9\-]+")


# ============================================
# 트렌드 분석기
# ============================================


class TrendAnalyzer:
    """Trend analyzer for collected news data.

    Extracts keywords from news titles, turns keyword frequency and
    first-half/second-half growth into trend signals, and scores
    app-development opportunities per keyword category.

    Attributes:
        min_frequency: Minimum number of occurrences for a keyword to be
            considered a trend.
        top_trends: Maximum number of trends to emit.
        top_opportunities: Maximum number of opportunities to emit.
    """

    def __init__(
        self,
        min_frequency: int = 2,
        top_trends: int = 20,
        top_opportunities: int = 5,
    ) -> None:
        """Store the analysis thresholds.

        Args:
            min_frequency: Minimum keyword occurrence count. Defaults to 2.
            top_trends: Number of trends to extract. Defaults to 20.
            top_opportunities: Number of opportunities to extract.
                Defaults to 5.
        """
        self.min_frequency = min_frequency
        self.top_trends = top_trends
        self.top_opportunities = top_opportunities

    def analyze(
        self, result: CollectionResult
    ) -> AnalysisReport:
        """Analyze a collection result and build the full report.

        Args:
            result: News collection result; only ``result.items`` is read.

        Returns:
            Report containing trend signals, scored opportunities, the
            number of input items, the elapsed time and the sorted list of
            categories that appear among the trends. For an empty input a
            report with empty lists and ``raw_item_count=0`` is returned.
        """
        start = time.monotonic()
        items = result.items

        if not items:
            logger.warning("분석할 뉴스 항목이 없음")
            return AnalysisReport(
                trends=[],
                opportunities=[],
                raw_item_count=0,
            )

        logger.info("분석 시작: %d건의 뉴스 항목", len(items))

        # 1. Keyword extraction
        keyword_counts = self._extract_keywords(items)
        frequent_count = sum(
            1
            for count in keyword_counts.values()
            if count >= self.min_frequency
        )
        logger.info(
            "키워드 추출 완료: %d개 (min_freq=%d 이상)",
            frequent_count,
            self.min_frequency,
        )

        # 2. Trend signals
        trends = self._build_trends(keyword_counts, items)
        logger.info("트렌드 시그널 생성 완료: %d개", len(trends))

        # 3. Opportunity scoring
        opportunities = self._score_opportunities(trends, items)
        logger.info("앱 기회 도출 완료: %d개", len(opportunities))

        duration = time.monotonic() - start
        logger.info("분석 완료 (%.2f초)", duration)

        # Sorted so the report is stable across runs: iterating a set of
        # strings depends on per-process hash randomization.
        domains = sorted(
            {
                category
                for trend in trends
                if (
                    category := _KEYWORD_CATEGORIES.get(
                        trend.keyword.lower()
                    )
                )
            }
        )

        return AnalysisReport(
            trends=trends,
            opportunities=opportunities,
            raw_item_count=len(items),
            analysis_duration_seconds=round(duration, 2),
            domains=domains,
        )

    @staticmethod
    def _title_keywords(title: str) -> list[str]:
        """Tokenize one title into candidate keywords, in title order.

        The title is lowercased and split with ``_TOKEN_PATTERN``. Leading
        and trailing hyphens are stripped so that fragments such as ``--``
        or ``-based`` do not become keywords of their own. Tokens shorter
        than two characters and stopwords are dropped.

        Args:
            title: Raw news title.

        Returns:
            Keyword tokens; duplicates within the title are kept.
        """
        keywords: list[str] = []
        for raw_token in _TOKEN_PATTERN.findall(title.lower()):
            token = raw_token.strip("-")
            if len(token) >= 2 and token not in _ALL_STOPWORDS:
                keywords.append(token)
        return keywords

    @staticmethod
    def _find_midpoint(
        timestamps: "list[datetime]",
    ) -> "datetime | None":
        """Return the timestamp that separates the first and second half.

        The midpoint is the upper median (``sorted[len // 2]``). Items
        strictly before it count as first half, everything else as second
        half.

        Args:
            timestamps: Publication timestamps (``None`` already removed).

        Returns:
            The splitting timestamp, or ``None`` when there are no
            timestamps or they are all identical, in which case no
            meaningful halves exist.
        """
        if not timestamps:
            return None
        ordered = sorted(timestamps)
        if ordered[0] == ordered[-1]:
            return None
        return ordered[len(ordered) // 2]

    @staticmethod
    def _growth_rate(first: int, second: int) -> float:
        """Compute the clamped growth between the two halves.

        Args:
            first: Number of items mentioning the keyword in the first half.
            second: Number of such items in the second half.

        Returns:
            ``(second - first) / max(first, 1)`` clamped to ``[-1.0, 1.0]``;
            ``0.0`` when both counts are zero.
        """
        if first <= 0 and second <= 0:
            return 0.0
        raw_rate = (second - first) / max(first, 1)
        return max(-1.0, min(1.0, raw_rate))

    def _extract_keywords(
        self, items: list[NewsItem]
    ) -> dict[str, int]:
        """Count keyword occurrences across all news titles.

        Each title is lowercased, tokenized and filtered (tokens of at
        least two characters, stopwords removed); every remaining token
        occurrence is counted.

        Args:
            items: News items; only ``item.title`` is read.

        Returns:
            Mapping of keyword to total occurrence count, in first-seen
            order.
        """
        counter: Counter[str] = Counter()
        for item in items:
            counter.update(self._title_keywords(item.title))
        return dict(counter)

    def _build_trends(
        self,
        keyword_counts: dict[str, int],
        items: list[NewsItem],
    ) -> list[TrendSignal]:
        """Convert keyword frequencies into TrendSignal objects.

        Only keywords with at least ``min_frequency`` occurrences are
        considered. Growth is derived by comparing how many items mention
        the keyword before the median publication timestamp with how many
        mention it at or after it.

        Args:
            keyword_counts: Keyword occurrence counts.
            items: Original news items.

        Returns:
            At most ``top_trends`` signals, ordered by descending frequency
            (ties keep the order of ``keyword_counts``).
        """
        midpoint = self._find_midpoint(
            [
                item.published_at
                for item in items
                if item.published_at is not None
            ]
        )

        # Per keyword: sources, related URLs and per-half item counts.
        keyword_meta: dict[str, dict] = {}
        for item in items:
            # A set, so each item contributes at most once per keyword.
            for token in set(self._title_keywords(item.title)):
                if keyword_counts.get(token, 0) < self.min_frequency:
                    continue
                meta = keyword_meta.get(token)
                if meta is None:
                    meta = {
                        "sources": set(),
                        "urls": [],
                        "first_half": 0,
                        "second_half": 0,
                    }
                    keyword_meta[token] = meta
                meta["sources"].add(item.source)
                meta["urls"].append(item.url)
                if midpoint is None or item.published_at is None:
                    continue
                # Strict "<": the median item itself opens the second half.
                # With "<=" the first half is always the larger one, and for
                # two items (or identical timestamps) every keyword would
                # come out as declining.
                if item.published_at < midpoint:
                    meta["first_half"] += 1
                else:
                    meta["second_half"] += 1

        ranked = sorted(
            keyword_counts.items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        trends: list[TrendSignal] = []
        for keyword, count in ranked:
            if count < self.min_frequency:
                # Sorted by descending count: nothing later can qualify.
                break
            if len(trends) >= self.top_trends:
                break
            meta = keyword_meta.get(keyword)
            if meta is None:
                # Counted keyword that no title in ``items`` produced.
                continue

            growth_rate = self._growth_rate(
                meta["first_half"], meta["second_half"]
            )
            trends.append(
                TrendSignal(
                    keyword=keyword,
                    frequency=count,
                    sources=sorted(meta["sources"]),
                    growth_rate=round(growth_rate, 3),
                    category=_KEYWORD_CATEGORIES.get(keyword, ""),
                    related_items=meta["urls"][:10],  # at most 10
                    stage=self._determine_stage(count, growth_rate),
                )
            )

        return trends

    def _determine_stage(
        self, frequency: int, growth_rate: float
    ) -> str:
        """Classify a trend's stage from its frequency and growth rate.

        Args:
            frequency: Keyword occurrence count.
            growth_rate: Growth rate in the range -1 to 1.

        Returns:
            One of ``"emerging"`` (growth above 0.3 with frequency of at
            most 5), ``"growing"`` (growth above 0.1), ``"declining"``
            (growth below -0.2) or ``"mainstream"`` otherwise.
        """
        if growth_rate > 0.3 and frequency <= 5:
            return "emerging"
        if growth_rate > 0.1:
            return "growing"
        if growth_rate < -0.2:
            return "declining"
        return "mainstream"

    def _score_opportunities(
        self,
        trends: list[TrendSignal],
        items: list[NewsItem],
    ) -> list[AppOpportunity]:
        """Derive app opportunities from category clusters of trends.

        Trends are grouped by category (uncategorized ones under
        ``"general"``) and each group is scored into one opportunity.

        Args:
            trends: Trend signals, expected in descending frequency order.
            items: Original news items. Currently unused; kept so the
                signature stays stable for callers.

        Returns:
            At most ``top_opportunities`` opportunities ordered by
            descending ``total_score``. A non-positive limit yields an
            empty list.
        """
        if not trends:
            return []

        clusters: dict[str, list[TrendSignal]] = {}
        for trend in trends:
            clusters.setdefault(
                trend.category or "general", []
            ).append(trend)

        opportunities = [
            self._build_opportunity(cat, cat_trends)
            for cat, cat_trends in clusters.items()
        ]
        opportunities.sort(
            key=lambda opp: opp.total_score, reverse=True
        )
        # Clamp: a negative limit would otherwise slice off the tail only.
        return opportunities[: max(self.top_opportunities, 0)]

    def _build_opportunity(
        self, cat: str, cat_trends: list[TrendSignal]
    ) -> AppOpportunity:
        """Score one non-empty category cluster into an AppOpportunity.

        Args:
            cat: Category name of the cluster.
            cat_trends: Trends of that category; must not be empty.

        Returns:
            Opportunity whose five sub-scores are each capped at 10.
        """
        trend_count = len(cat_trends)
        top_keywords = [t.keyword for t in cat_trends[:3]]
        title = (
            f"{cat.upper()} - "
            + " + ".join(top_keywords)
            + " 기반 앱"
        )

        # Demand signals: the cluster's leading keywords.
        demand_signals = [
            f"{t.keyword} (빈도: {t.frequency},"
            f" 성장: {t.growth_rate:+.1%})"
            for t in cat_trends[:5]
        ]

        # Technical feasibility: fixed baseline per category.
        if cat in ("ai", "fintech", "security"):
            tech_feasibility = 6.0
        elif cat in ("web", "saas"):
            tech_feasibility = 8.0
        else:
            tech_feasibility = 7.0

        # Market demand: two points per related trend.
        market_demand = min(10.0, trend_count * 2.0)

        # Differentiation: share of emerging/growing trends.
        rising_count = sum(
            1
            for t in cat_trends
            if t.stage in ("emerging", "growing")
        )
        differentiation = min(
            10.0, rising_count / trend_count * 10.0
        )

        # Timing: average growth mapped from [-1, 1] onto [0, 10].
        avg_growth = (
            sum(t.growth_rate for t in cat_trends) / trend_count
        )
        timing = min(10.0, max(0.0, 5.0 + avg_growth * 5.0))

        # Scalability: three points per distinct source.
        all_sources: set[str] = set()
        for t in cat_trends:
            all_sources.update(t.sources)
        scalability = min(10.0, len(all_sources) * 3.0)

        # Copy so the module-level default lists are never shared.
        suggested_stack = list(
            _CATEGORY_STACKS.get(cat, ["TypeScript", "Next.js"])
        )

        return AppOpportunity(
            title=title,
            description=(
                f"{cat} 도메인의 주요 트렌드"
                f" ({', '.join(top_keywords)})를"
                " 결합한 앱 기회. "
                f"{trend_count}개 관련 트렌드"
                " 시그널 감지."
            ),
            platform="web",
            demand_signals=demand_signals,
            tech_feasibility=round(tech_feasibility, 1),
            market_demand=round(market_demand, 1),
            differentiation=round(differentiation, 1),
            timing=round(timing, 1),
            scalability=round(scalability, 1),
            competing_products=[],
            suggested_stack=suggested_stack,
            related_trends=[t.keyword for t in cat_trends],
        )


# ============================================
# CLI 엔트리포인트
# ============================================


def _parse_args(
    argv: "list[str] | None" = None,
) -> "argparse.Namespace":
    """Parse the command-line arguments of the analyzer CLI.

    Args:
        argv: Argument list to parse instead of the process arguments.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``,
            which is the behaviour callers without arguments get.

    Returns:
        Namespace with ``input`` and ``output`` (paths or ``None``) and the
        integers ``min_frequency``, ``top_trends`` and
        ``top_opportunities``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="뉴스 트렌드 분석기",
        # Raw formatter keeps the line breaks of the epilog examples.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "예시:\n"
            "  uv run -m news.analyzer"
            " --input raw.json --output analysis.json\n"
            "  cat raw.json | uv run -m news.analyzer"
            " --min-frequency 3"
        ),
    )
    parser.add_argument(
        "--input",
        type=str,
        default=None,
        help="수집 결과 JSON 파일 (기본: stdin)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="분석 결과 JSON 파일 (기본: stdout)",
    )
    parser.add_argument(
        "--min-frequency",
        type=int,
        default=2,
        help="최소 키워드 빈도 (기본: 2)",
    )
    parser.add_argument(
        "--top-trends",
        type=int,
        default=20,
        help="추출할 트렌드 수 (기본: 20)",
    )
    parser.add_argument(
        "--top-opportunities",
        type=int,
        default=5,
        help="추출할 기회 수 (기본: 5)",
    )
    return parser.parse_args(argv)


def _main() -> None:
    """Synchronous CLI entry point.

    Reads a collection result as JSON (from ``--input`` or stdin), runs the
    trend analysis and emits the report as indented JSON, either to the
    ``--output`` file (followed by a one-line summary on stdout) or to
    stdout itself.
    """
    import json
    import sys
    from pathlib import Path

    args = _parse_args()

    # Source of the raw JSON: the given file, otherwise standard input.
    raw_json = (
        Path(args.input).read_text(encoding="utf-8")
        if args.input
        else sys.stdin.read()
    )
    collection_result = CollectionResult.model_validate(
        json.loads(raw_json)
    )

    report = TrendAnalyzer(
        min_frequency=args.min_frequency,
        top_trends=args.top_trends,
        top_opportunities=args.top_opportunities,
    ).analyze(collection_result)

    # ensure_ascii=False keeps non-ASCII text readable in the output.
    json_str = json.dumps(
        report.model_dump(mode="json"),
        ensure_ascii=False,
        indent=2,
    )

    if not args.output:
        sys.stdout.write(json_str + "\n")
        return

    output_path = Path(args.output)
    # Create missing parent directories before writing the report.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json_str, encoding="utf-8")
    print(
        f"분석 결과 저장: {output_path}"
        f" (트렌드: {len(report.trends)}개,"
        f" 기회: {len(report.opportunities)}개)"
    )

# Script entry point: configure INFO-level logging with a timestamped format,
# then run the CLI. Nothing here executes when the module is imported.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )
    _main()