엑셀 파일 읽기 및 데이터 추출 유틸리티

02-reusable-code-python/utils/excel_reader.py
reusable
python
"""
엑셀 파일 읽기 및 데이터 추출 유틸리티

@source kcsi-smpa
@extracted 2026-02-15
@version 1.1.0

의존성:
    - pandas (필수)
    - openpyxl (필수, 엑셀 읽기 엔진)

사용법:
    from utils.excel_reader import (
        analyze_excel_structure,
        get_sheet_names,
        read_all_sheets,
        read_excel_file,
    )

    # 엑셀 파일 읽기
    df = read_excel_file("data.xlsx", sheet_name="Sheet1")

    # 시트 이름 목록 조회
    names = get_sheet_names("data.xlsx")

    # 모든 시트 읽기
    sheets = read_all_sheets("data.xlsx")

    # 구조 분석
    info = analyze_excel_structure("data.xlsx")
"""

import logging
from pathlib import Path
from typing import Any

import pandas as pd

logger = logging.getLogger(__name__)


def read_excel_file(
    file_path: str | Path,
    sheet_name: str | int | None = 0,
    header: int = 0,
) -> pd.DataFrame:
    """엑셀 파일을 읽어 DataFrame으로 반환

    Args:
        file_path: 엑셀 파일 경로
        sheet_name: 시트 이름 또는 인덱스 (기본값: 첫 번째 시트)
        header: 헤더 행 인덱스 (기본값: 0)

    Returns:
        pandas DataFrame
    """
    return pd.read_excel(file_path, sheet_name=sheet_name, header=header)


def read_all_sheets(file_path: str | Path) -> dict[str, pd.DataFrame]:
    """엑셀 파일의 모든 시트를 읽어 딕셔너리로 반환

    Args:
        file_path: 엑셀 파일 경로

    Returns:
        {시트명: DataFrame} 딕셔너리
    """
    return pd.read_excel(file_path, sheet_name=None)


def get_sheet_names(file_path: str | Path) -> list[str]:
    """List the worksheet names of an Excel workbook.

    The workbook is opened with openpyxl in read-only mode and is always
    closed before returning, even if reading the names fails.

    Args:
        file_path: Location of the Excel file.

    Returns:
        The sheet names as reported by the workbook's ``sheetnames``.
    """
    # Imported lazily so openpyxl is only required when this helper is used.
    from openpyxl import load_workbook

    workbook = load_workbook(file_path, read_only=True)
    try:
        names = workbook.sheetnames
    finally:
        workbook.close()
    return names


def extract_table_data(
    df: pd.DataFrame,
    start_row: int = 0,
    end_row: int | None = None,
    columns: list[str] | None = None,
) -> list[dict[str, Any]]:
    """DataFrame에서 테이블 데이터를 딕셔너리 리스트로 추출

    Args:
        df: pandas DataFrame
        start_row: 시작 행 (기본값: 0)
        end_row: 끝 행 (기본값: None, 마지막까지)
        columns: 추출할 컬럼 목록 (기본값: None, 모든 컬럼)

    Returns:
        딕셔너리 리스트
    """
    if columns:
        df = df[columns]

    if end_row:
        df = df.iloc[start_row:end_row]
    else:
        df = df.iloc[start_row:]

    return df.to_dict('records')


def analyze_excel_structure(file_path: str | Path) -> dict[str, Any]:
    """Summarize the layout of an Excel workbook.

    The workbook is opened with openpyxl in read-only mode and is always
    closed before returning.

    Args:
        file_path: Location of the Excel file.

    Returns:
        A dictionary with:
            - ``"file_path"``: the given path converted with ``str()``
            - ``"sheet_count"``: number of worksheets
            - ``"sheets"``: one ``{"name", "max_row", "max_column"}`` entry
              per worksheet, in workbook order
    """
    # Imported lazily so openpyxl is only required when this helper is used.
    from openpyxl import load_workbook

    workbook = load_workbook(file_path, read_only=True)
    try:
        titles = workbook.sheetnames
        # Pair each title with its worksheet so every sheet is looked up once.
        sheet_summaries = [
            {
                "name": title,
                "max_row": worksheet.max_row,
                "max_column": worksheet.max_column,
            }
            for title, worksheet in ((name, workbook[name]) for name in titles)
        ]
        return {
            "file_path": str(file_path),
            "sheet_count": len(titles),
            "sheets": sheet_summaries,
        }
    finally:
        workbook.close()