fix(model_optimization): BERT 양자화 코드 수정

c75845c8 · insun park · 8b7aef2a · c75845c8
Commit c75845c8 authored Jun 24, 2025 by insun park
--- a/ai lecture/source_code/12_model_optimization/bert_quantization.py
+++ b/ai lecture/source_code/12_model_optimization/bert_quantization.py
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import os
+import time
+import numpy as np


 def get_model_size(model):
@@ -15,10 +17,42 @@ def get_model_size(model):
    return size_all_mb


+def benchmark_performance(pipe, text, num_runs=100, num_warmup=10):
+    """
+    Benchmark a pipeline's latency and throughput on a single input.
+
+    Makes ``num_warmup`` untimed warm-up calls, then times ``num_runs``
+    calls of ``pipe(text)`` one at a time.
+
+    :param pipe: callable invoked as ``pipe(text)`` (e.g. a Hugging Face pipeline)
+    :param text: input passed unchanged to every call
+    :param num_runs: number of timed calls; a value <= 0 skips the measurement
+    :param num_warmup: number of untimed warm-up calls made before measuring
+    :return: (mean latency in ms, throughput in inferences/sec), or
+        (0.0, 0.0) when no timed call was made
+    """
+    # Untimed warm-up calls keep one-off start-up costs out of the samples.
+    print("성능 측정을 위한 예열 중...")
+    for _ in range(num_warmup):
+        pipe(text)
+
+    print(f"{num_runs}회 추론으로 성능을 측정합니다...")
+    latencies = []
+    for _ in range(num_runs):
+        # perf_counter() is the clock intended for timing short durations;
+        # time.time() follows the wall clock and can be coarse or adjusted.
+        start_time = time.perf_counter()
+        pipe(text)
+        latencies.append((time.perf_counter() - start_time) * 1000)  # s -> ms
+
+    if not latencies:
+        # np.mean([]) would emit a RuntimeWarning and return nan.
+        return 0.0, 0.0
+
+    avg_latency_ms = float(np.mean(latencies))
+    # Calls run one at a time, so throughput is the reciprocal of mean latency.
+    throughput = 1000 / avg_latency_ms if avg_latency_ms > 0 else 0.0
+
+    return avg_latency_ms, throughput
+
+
 def main():
    """
    Hugging Face의 사전 학습된 BERT 모델을 로드하고, 8-bit 양자화를 적용한 후
-    원본 모델과 양자화된 모델의 크기를 비교하고 추론 결과를 확인하는 스크립트입니다.
+    원본 모델과 양자화된 모델의 크기, 성능(지연 시간, 처리량)을 비교하고
+    추론 결과를 확인하는 스크립트입니다.
    """
    # 사용할 모델과 디바이스 설정
    model_id = "bert-base-uncased"
@@ -48,50 +82,98 @@ def main():
    # `load_in_8bit=True` 옵션을 사용하여 모델을 8-bit로 양자화합니다.
    # `device_map="auto"`는 모델 레이어를 사용 가능한 디바이스(GPU, CPU, RAM)에 자동으로 분배합니다.
    print("\n--- 2. 8-bit 양자화 모델 로드 중... ---")
+    int8_model = None
+    int8_model_size = 0
    try:
        int8_model = AutoModelForSequenceClassification.from_pretrained(
            model_id, load_in_8bit=True, device_map="auto"
        )
-        int8_model_size = get_model_size(int8_model)
+        # 양자화된 모델의 파라미터는 `int8`이지만, 계산 중에는 다른 자료형을 사용할 수 있어
+        # 정확한 크기 계산을 위해 `get_memory_footprint`를 사용하는 것이 더 적합합니다.
+        int8_model_size = int8_model.get_memory_footprint() / 1024**2
        print(f"INT8 모델 크기: {int8_model_size:.2f} MB")

-        size_reduction = 100 * (1 - int8_model_size / fp32_model_size)
-        print(f"\n메모리 사용량 감소율: {size_reduction:.2f}%")
-
    except Exception as e:
        print(f"8-bit 모델 로드 중 오류 발생: {e}")
        print("bitsandbytes가 올바르게 설치되었는지, CUDA 환경이 맞는지 확인하세요.")
-        int8_model = None

-    # --- 3. 추론 파이프라인으로 결과 비교 ---
-    print("\n--- 3. 추론 결과 비교 ---")
+    # --- 3. 추론 파이프라인 생성 ---
+    print("\n--- 3. 추론 파이프라인 생성 ---")
    text = "This is a great movie, I really enjoyed it!"
    print(f"입력 텍스트: '{text}'")

-    # FP32 모델 추론
-    print("\nFP32 모델 추론 결과:")
    classifier_fp32 = pipeline(
        "text-classification",
        model=fp32_model,
        tokenizer=tokenizer,
        device=0 if device == "cuda" else -1,
    )
-    result_fp32 = classifier_fp32(text)
-    print(result_fp32)
+    print("FP32 파이프라인 생성 완료.")

-    # INT8 모델 추론
+    classifier_int8 = None
    if int8_model:
-        print("\nINT8 모델 추론 결과:")
-        # 양자화된 모델은 device_map을 통해 이미 GPU에 할당되어 있으므로 device=-1로 설정해도 GPU에서 실행됩니다.
+        # 양자화된 모델은 device_map을 통해 이미 GPU에 할당되어 있으므로 device 설정이 필요 없습니다.
        classifier_int8 = pipeline(
            "text-classification", model=int8_model, tokenizer=tokenizer
        )
+        print("INT8 파이프라인 생성 완료.")
+
+    # --- 4. 성능 벤치마킹 ---
+    print("\n--- 4. 성능 벤치마킹 ---")
+    fp32_latency, fp32_throughput = benchmark_performance(classifier_fp32, text)
+
+    int8_latency, int8_throughput = (0, 0)
+    if classifier_int8:
+        int8_latency, int8_throughput = benchmark_performance(classifier_int8, text)
+
+    # --- 5. 결과 요약 및 비교 ---
+    print("\n\n--- 5. 최종 성능 비교 ---")
+    print("=" * 50)
+    header = f"{'모델':<10} | {'모델 크기(MB)':<15} | {'평균 지연시간(ms)':<20} | {'처리량(inf/sec)':<15}"
+    print(header)
+    print("-" * len(header) * 2)
+
+    fp32_results = f"{'FP32':<10} | {fp32_model_size:<15.2f} | {fp32_latency:<20.2f} | {fp32_throughput:<15.2f}"
+    print(fp32_results)
+
+    if int8_model:
+        int8_results = f"{'INT8':<10} | {int8_model_size:<15.2f} | {int8_latency:<20.2f} | {int8_throughput:<15.2f}"
+        print(int8_results)
+
+        print("-" * len(header) * 2)
+        size_reduction = 100 * (1 - int8_model_size / fp32_model_size)
+        latency_improvement = (
+            100 * (fp32_latency - int8_latency) / fp32_latency if fp32_latency > 0 else 0
+        )
+        throughput_improvement = (
+            100 * (int8_throughput - fp32_throughput) / fp32_throughput
+            if fp32_throughput > 0
+            else 0
+        )
+        print(f"📊 모델 크기 감소율: {size_reduction:.2f}%")
+        print(f"⏱️ 지연 시간 개선율: {latency_improvement:.2f}%")
+        print(f"🚀 처리량 향상률: {throughput_improvement:.2f}%")
+
+    print("=" * 50)
+
+    # --- 6. 추론 결과 비교 ---
+    print("\n--- 6. 추론 결과 비교 ---")
+    print(f"입력 텍스트: '{text}'")
+
+    # FP32 모델 추론
+    print("\nFP32 모델 추론 결과:")
+    result_fp32 = classifier_fp32(text)
+    print(result_fp32)
+
+    # INT8 모델 추론
+    if classifier_int8:
+        print("\nINT8 모델 추론 결과:")
        result_int8 = classifier_int8(text)
        print(result_int8)

    print("\n실습 완료!")
    print(
-        "양자화를 통해 모델 크기가 크게 줄어들면서도 추론 결과는 유사하게 유지되는 것을 확인할 수 있습니다."
+        "양자화를 통해 모델 크기가 크게 줄어들고, 지연 시간이 감소하며 처리량이 향상되는 것을 확인할 수 있습니다."
    )