# 🚀 AI 교육 종합 정리 문서

## Python + NumPy + Pandas + Scikit-learn + Seaborn 마스터 가이드

> **📅 작성일**: 2024년
>
> **🎯 목적**: 1달간 공부를 쉬어도 다시 보면 이해할 수 있는 종합 정리서
>
> **👥 대상**: AI 교육 과정을 수료한 직원들

---

## 📋 목차

1. [Python 핵심 문법 요약](#1-python-핵심-문법-요약)
2. [NumPy - 수치 계산의 기초](#2-numpy---수치-계산의-기초)
3. [Pandas - 데이터 분석의 핵심](#3-pandas---데이터-분석의-핵심)
4. [Scikit-learn - 머신러닝 라이브러리](#4-scikit-learn---머신러닝-라이브러리)
5. [Seaborn - 데이터 시각화](#5-seaborn---데이터-시각화)
6. [실전 프로젝트 예제](#6-실전-프로젝트-예제)
7. [자주 사용하는 코드 패턴](#7-자주-사용하는-코드-패턴)
8. [문제 해결 가이드](#8-문제-해결-가이드)

---

## 1. Python 핵심 문법 요약

### 1.1 기본 데이터 타입과 변수

```python
# Numbers
age = 25                # integer
height = 175.5          # float
complex_num = 3 + 4j    # complex number

# Strings
name = "홍길동"
message = '안녕하세요'
multi_line = """
여러 줄의
문자열입니다
"""

# Booleans
is_student = True
is_working = False

# None (absence of a value)
empty_value = None
```

### 1.2 컬렉션 데이터 타입

```python
# List (mutable, ordered)
fruits = ['사과', '바나나', '오렌지']
fruits.append('포도')       # append
fruits[0] = '배'            # modify in place
first_fruit = fruits[0]     # access by index

# Tuple (immutable, ordered)
coordinates = (10, 20)
x, y = coordinates          # unpacking

# Dictionary (key-value pairs)
person = {
    'name': '김철수',
    'age': 30,
    'city': '서울'
}
person['job'] = '개발자'    # add a key
name = person['name']       # access by key

# Set (no duplicates, unordered)
unique_numbers = {1, 2, 3, 3, 4}  # result: {1, 2, 3, 4}
```

### 1.3 제어문

```python
# if-elif-else
score = 85
if score >= 90:
    grade = 'A'
elif score >= 80:
    grade = 'B'
elif score >= 70:
    grade = 'C'
else:
    grade = 'D'

# for loop
for i in range(5):
    print(i)  # 0, 1, 2, 3, 4

for fruit in fruits:
    print(f"과일: {fruit}")

# while loop
count = 0
while count < 3:
    print(f"카운트: {count}")
    count += 1

# List comprehensions
squares = [x**2 for x in range(5)]  # [0, 1, 4, 9, 16]
even_squares = [x**2 for x in range(10) if x % 2 == 0]
```

### 1.4 함수

```python
# Basic function definition
def greet(name, age=20):
    """Return a Korean greeting that mentions the given name and age.

    Args:
        name: Name to greet.
        age: Age to mention; defaults to 20.

    Returns:
        The formatted greeting string.
    """
    return f"안녕하세요, {name}님! 나이는 {age}세입니다."

# Calling the function
message = greet("김철수", 25)
message2 = greet("이영희")  # age falls back to the default of 20

# Lambda (a short one-line function)
add = lambda x, y: x + y
result = add(3, 5)  # 8

# Returning several values
def get_info():
    """Return a (name, age, city) tuple."""
    return "홍길동", 30, "서울"

name, age, city = get_info()
```

### 1.5 클래스와 객체지향 프로그래밍

```python
class Person:
    """A person with a name and an age."""

    def __init__(self, name, age):
        """Store the person's name and age."""
        self.name = name
        self.age = age

    def introduce(self):
        """Return a self-introduction sentence."""
        return f"저는 {self.name}이고, {self.age}살입니다."

    def have_birthday(self):
        """Increase the age by one and return a birthday message."""
        self.age += 1
        return f"{self.name}의 생일! 나이는 {self.age}살이 되었습니다."

# Creating and using an object
person1 = Person("김철수", 25)
print(person1.introduce())      # 저는 김철수이고, 25살입니다.
print(person1.have_birthday())  # 김철수의 생일! 나이는 26살이 되었습니다.
```

---

## 2. NumPy - 수치 계산의 기초

### 2.1 NumPy 배열 생성과 기본 연산

```python
import numpy as np

# Array creation
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]])  # 2-D array

# Special arrays
zeros = np.zeros((3, 4))          # 3x4 array of zeros
ones = np.ones((2, 3))            # 2x3 array of ones
range_arr = np.arange(0, 10, 2)   # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5)   # [0, 0.25, 0.5, 0.75, 1]

# Array attributes
print(f"배열 모양: {arr2.shape}")    # (2, 3)
print(f"배열 차원: {arr2.ndim}")     # 2
print(f"배열 크기: {arr2.size}")     # 6
print(f"데이터 타입: {arr2.dtype}")  # int64 on most platforms (the default integer type is platform-dependent)
```

### 2.2 배열 인덱싱과 슬라이싱

```python
# 1-D indexing
arr = np.array([10, 20, 30, 40, 50])
print(arr[0])     # 10 (first element)
print(arr[-1])    # 50 (last element)
print(arr[1:4])   # [20, 30, 40] (indices 1 through 3)
print(arr[::2])   # [10, 30, 50] (every second element)

# 2-D indexing
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix[0, 1])     # 2 (row 0, column 1)
print(matrix[1:3, :])   # [[4, 5, 6], [7, 8, 9]] (rows 1-2, all columns)
print(matrix[:, 1])     # [2, 5, 8] (all rows, column 1)

# Boolean indexing
mask = arr > 30
print(arr[mask])  # [40, 50] (values greater than 30)
```

### 2.3 배열 연산

```python
# Basic arithmetic (element-wise)
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(a + b)    # [5, 7, 9] (addition)
print(a - b)    # [-3, -3, -3] (subtraction)
print(a * b)    # [4, 10, 18] (element-wise multiplication)
print(a / b)    # [0.25, 0.4, 0.5] (element-wise division)
print(a ** 2)   # [1, 4, 9] (squaring)

# Broadcasting (shapes are matched automatically)
arr = np.array([1, 2, 3, 4])
print(arr + 10)  # [11, 12, 13, 14] (add 10 to every element)
print(arr * 2)   # [2, 4, 6, 8] (multiply every element by 2)
```

### 2.4 통계 함수

```python
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"평균: {np.mean(data)}")      # 5.5
print(f"중앙값: {np.median(data)}")  # 5.5
# np.std defaults to the population formula (ddof=0);
# np.std(data, ddof=1) gives the sample value 3.0276503540974917.
print(f"표준편차: {np.std(data)}")   # 2.8722813232690143
# np.var also defaults to ddof=0; np.var(data, ddof=1) gives 9.166666666666666.
print(f"분산: {np.var(data)}")    # 8.25
print(f"최솟값: {np.min(data)}")  # 1
print(f"최댓값: {np.max(data)}")  # 10
print(f"합계: {np.sum(data)}")    # 55
```

### 2.5 선형대수 연산

```python
# Build the matrices
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication
C = np.dot(A, B)  # or A @ B
print("행렬 곱셈:")
print(C)

# Transpose
A_T = A.T
print("전치 행렬:")
print(A_T)

# Determinant
det_A = np.linalg.det(A)
print(f"행렬식: {det_A}")

# Inverse
A_inv = np.linalg.inv(A)
print("역행렬:")
print(A_inv)

# Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"고유값: {eigenvalues}")
print("고유벡터:")
print(eigenvectors)
```

---

## 3. Pandas - 데이터 분석의 핵심

### 3.1 Series와 DataFrame 생성

```python
import pandas as pd

# Series (1-D data)
s1 = pd.Series([1, 3, 5, 7, 9])
s2 = pd.Series([1, 3, 5, 7, 9], index=['a', 'b', 'c', 'd', 'e'])

# DataFrame (2-D data)
data = {
    '이름': ['김철수', '이영희', '박민수', '최지영'],
    '나이': [25, 28, 22, 30],
    '직업': ['개발자', '디자이너', '학생', '마케터'],
    '급여': [3000, 3500, 0, 4000]
}
df = pd.DataFrame(data)

# Reading files (the paths are examples; the files must exist)
df_from_csv = pd.read_csv('data.csv')
df_from_excel = pd.read_excel('data.xlsx')
```

### 3.2 데이터 탐색과 기본 정보

```python
# Basic information
# df.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()             # dtypes, memory usage, ...
print(df.describe())  # summary statistics of numeric columns
print(df.head())      # first 5 rows
print(df.tail(3))     # last 3 rows
print(df.shape)       # (number of rows, number of columns)
print(df.columns)     # column names
print(df.index)       # index

# Data types
print(df.dtypes)

# Missing values
print(df.isnull().sum())  # number of missing values per column
```

### 3.3 데이터 선택과 필터링

```python
# Column selection
names = df['이름']            # single column
subset = df[['이름', '나이']]  # several columns
subset2 = df.loc[:, ['이름', '급여']]
# (the selection just above goes through .loc)

# Row selection
first_row = df.iloc[0]           # first row
first_two = df.iloc[0:2]         # first two rows
specific_rows = df.iloc[[0, 2]]  # rows 0 and 2

# Conditional filtering
young_people = df[df['나이'] < 25]       # younger than 25
high_salary = df[df['급여'] > 3000]      # salary above 3000
developers = df[df['직업'] == '개발자']   # job is developer

# Combined conditions
young_dev = df[(df['나이'] < 30) & (df['직업'] == '개발자')]
```

### 3.4 데이터 수정과 추가

```python
# Add columns
df['경력'] = [2, 5, 0, 8]
df['급여등급'] = df['급여'].apply(lambda x: '고급' if x > 3500 else '일반')

# Modify values
df.loc[df['직업'] == '학생', '급여'] = 1000

# Add a row. Every existing column is supplied: a column missing from the new
# row would be filled with NaN by concat and break the models in section 4.
new_row = pd.DataFrame({
    '이름': ['정수민'],
    '나이': [27],
    '직업': ['기획자'],
    '급여': [3800],
    '경력': [4],
    '급여등급': ['고급'],  # 3800 > 3500, same rule as above
})
df = pd.concat([df, new_row], ignore_index=True)

# Rename columns. The result is stored under a new name because the examples
# that follow keep using the original '급여' / '직업' columns of df.
df_renamed = df.rename(columns={'급여': '월급', '직업': '직종'})
```

### 3.5 데이터 그룹화와 집계

```python
# Per-group statistics
job_stats = df.groupby('직업').agg({
    '나이': ['mean', 'min', 'max'],
    '급여': ['mean', 'sum', 'count']
})

# Pivot table
pivot_table = df.pivot_table(
    values='급여',
    index='직업',
    columns='급여등급',
    aggfunc='mean',
    fill_value=0
)

# Cross tabulation
cross_tab = pd.crosstab(df['직업'], df['급여등급'])
```

### 3.6 결측값 처리

```python
# Inspect missing values
print(df.isnull().sum())

# Ways to handle missing values
df_drop = df.dropna()                                  # drop rows that contain missing values
df_fill = df.fillna(0)                                 # fill with 0
df_fill_mean = df['급여'].fillna(df['급여'].mean())      # fill with the column mean
# Interpolation is only defined for numeric data, so restrict it to numeric columns.
df_interpolate = df.select_dtypes(include='number').interpolate()
```

---

## 4. Scikit-learn - 머신러닝 라이브러리

### 4.1 머신러닝 기본 개념

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
```

### 4.2 데이터 전처리

```python
# Train/test split
X = df[['나이', '경력']]  # features
y = df['급여등급']        # target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling (fit on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Encoding the categorical target
from sklearn.preprocessing import LabelEncoder
# Fit on the full target so that a class which only appears in the test split
# is still known to the encoder (transform would raise ValueError otherwise).
le = LabelEncoder()
le.fit(y)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)
```

### 4.3 지도학습 - 분류

```python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Logistic regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train_encoded)
lr_pred = lr.predict(X_test_scaled)

# Decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train_encoded)
dt_pred = dt.predict(X_test_scaled)

# Random forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train_encoded)
rf_pred = rf.predict(X_test_scaled)

# Evaluation
print(f"로지스틱 회귀 정확도: {accuracy_score(y_test_encoded, lr_pred):.3f}")
print(f"결정 트리 정확도: {accuracy_score(y_test_encoded, dt_pred):.3f}")
print(f"랜덤 포레스트 정확도: {accuracy_score(y_test_encoded, rf_pred):.3f}")
```

### 4.4 지도학습 - 회귀

```python
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Regression needs a numeric target, so predict the salary itself rather than
# the '급여등급' label. Reusing the indices of the existing split keeps the
# targets aligned with X_train_scaled / X_test_scaled.
y_train_reg = df.loc[X_train.index, '급여']
y_test_reg = df.loc[X_test.index, '급여']

# Linear regression
lr_reg = LinearRegression()
lr_reg.fit(X_train_scaled, y_train_reg)
lr_pred_reg = lr_reg.predict(X_test_scaled)

# Random forest regression
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_scaled, y_train_reg)
rf_pred_reg = rf_reg.predict(X_test_scaled)

# Evaluation
# NOTE: R² needs at least two test samples; with the 5-row demo frame the test
# split holds a single row, so use a real dataset for meaningful scores.
print(f"선형 회귀 R²: {r2_score(y_test_reg, lr_pred_reg):.3f}")
print(f"랜덤 포레스트 R²: {r2_score(y_test_reg, rf_pred_reg):.3f}")
print(f"선형 회귀 MSE: {mean_squared_error(y_test_reg, lr_pred_reg):.3f}")
```

### 4.5 비지도학습 - 군집화

```python
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN  # DBSCAN lives in sklearn.cluster; there is no sklearn.clustering module

# Clustering is unsupervised, so scale the complete feature matrix.
X_scaled = StandardScaler().fit_transform(X)

# K-means clustering (n_init set explicitly: its default changed between releases)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Visualise the clusters
import matplotlib.pyplot as plt
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='viridis')
plt.title('K-means 군집화 결과')
plt.show()

# Finding a good number of clusters (elbow method)
inertias = []
# KMeans cannot build more clusters than there are samples.
K_range = range(1, min(10, len(X_scaled)) + 1)
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

plt.plot(K_range, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('엘보우 메서드')
plt.show()
```

### 4.6 모델 성능 향상

```python
from sklearn.model_selection import GridSearchCV, cross_val_score

# Cross-validation works on the complete data set: the scaled features from
# section 4.5 and the fully encoded target.
y_encoded = le.transform(y)

# NOTE: cv=5 needs at least 5 samples of every class; the tiny demo frame is
# too small, so run this part on a real dataset.
cv_scores = cross_val_score(rf, X_scaled, y_encoded, cv=5)
print(f"교차 검증 점수: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Hyper-parameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_scaled, y_encoded)

print(f"최적 파라미터: {grid_search.best_params_}")
print(f"최고 점수: {grid_search.best_score_:.3f}")
```

---

## 5. Seaborn - 데이터 시각화

### 5.1 기본 시각화

```python
import seaborn as sns
import matplotlib.pyplot as plt

# Style
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'Malgun Gothic'  # Korean font (ships with Windows)
plt.rcParams['axes.unicode_minus'] = False     # keep minus signs readable with a Korean font

# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='나이', bins=10, kde=True)
plt.title('나이 분포')
plt.show()

# Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='직업', y='급여')
plt.title('직업별 급여 분포')
plt.xticks(rotation=45)
plt.show()
```

### 5.2 관계형 시각화

```python
# Scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='나이', y='급여', hue='직업', size='경력')
plt.title('나이와 급여의 관계')
plt.show()

# Scatter plot with a regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=df, x='나이', y='급여', scatter_kws={'alpha':0.6})
plt.title('나이와 급여의 선형 관계')
plt.show()

# Correlation heat map
numeric_cols = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_cols.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('상관관계 히트맵')
plt.show()
```

### 5.3 분포 시각화

```python
# Comparing distributions
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, x='급여', hue='직업', common_norm=False)
plt.title('직업별 급여 분포')
plt.show()

# Violin plot
plt.figure(figsize=(12, 6))
sns.violinplot(data=df, x='직업', y='급여')
plt.title('직업별 급여 분포 (바이올린 플롯)')
plt.xticks(rotation=45)
plt.show()

# Pair plot
# pairplot is figure-level: it creates its own figure, so a preceding
# plt.figure() would only open an empty window. Size it with `height`.
sns.pairplot(df, hue='직업', diag_kind='kde', height=3)
plt.show()
```

### 5.4 카테고리형 데이터 시각화

```python
# Count plot
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='직업')
plt.title('직업별 인원 수')
plt.xticks(rotation=45)
plt.show()

# Per-group statistics
plt.figure(figsize=(12, 6))
# A 95% confidence interval is barplot's default error bar; the deprecated
# `ci=95` argument is therefore not needed.
sns.barplot(data=df, x='직업', y='급여')
plt.title('직업별 평균 급여 (95% 신뢰구간)')
plt.xticks(rotation=45)
plt.show()
```

---

## 6. 실전 프로젝트 예제

### 6.1 간단한 머신러닝 파이프라인

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt


# 1. Load and explore the data
def load_and_explore_data(file_path):
    """Load a dataset and print a basic exploratory summary.

    Args:
        file_path: Path of a CSV file (anything ``pd.read_csv`` accepts), or an
            already loaded ``pd.DataFrame``.

    Returns:
        The loaded DataFrame.
    """
    # main() hands over an in-memory DataFrame, which pd.read_csv cannot read,
    # so both forms are accepted. The copy keeps the caller's frame untouched.
    if isinstance(file_path, pd.DataFrame):
        df = file_path.copy()
    else:
        df = pd.read_csv(file_path)

    print("=== 데이터 기본 정보 ===")
    print(f"데이터 크기: {df.shape}")
    print(f"컬럼: {list(df.columns)}")
    print("\n=== 데이터 미리보기 ===")
    print(df.head())

    print("\n=== 데이터 타입 ===")
    print(df.dtypes)

    print("\n=== 결측값 ===")
    print(df.isnull().sum())

    print("\n=== 수치형 데이터 통계 ===")
    print(df.describe())

    return df


# 2. Preprocess the data
def preprocess_data(df, target_column):
    """Clean, split and scale the data.

    Args:
        df: Input DataFrame that contains the target column.
        target_column: Name of the column to predict.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``.
    """
    # Drop rows that contain missing values
    df_clean = df.dropna()

    # Separate features and target
    X = df_clean.drop(target_column, axis=1)
    y = df_clean[target_column]

    # Keep numeric feature columns only
    numeric_columns = X.select_dtypes(include=[np.number]).columns
    X_numeric = X[numeric_columns]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_numeric, y, test_size=0.2, random_state=42
    )

    # Scaling: statistics come from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler


# 3. Train and evaluate the model
def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    """Train a random forest classifier and print a classification report.

    Returns:
        Tuple ``(model, y_pred)`` with the fitted model and its test predictions.
    """
    # Train
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    print("=== 분류 보고서 ===")
    print(classification_report(y_test, y_pred))

    return model, y_pred


# 4. Visualise the results
def visualize_results(df, target_column, model, X_test_scaled, y_test, y_pred):
    """Plot feature importances and compare predictions with true values."""
    # Feature importance. The names are derived the same way preprocess_data
    # selects its features: numeric columns without the target.
    feature_importance = pd.DataFrame({
        'feature': df.drop(target_column, axis=1).select_dtypes(include=[np.number]).columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature')
    plt.title('특성 중요도')
    plt.show()

    # Predicted vs. actual values
    plt.figure(figsize=(8, 6))
    plt.scatter(range(len(y_test)), y_test, alpha=0.7, label='실제값')
    plt.scatter(range(len(y_pred)), y_pred, alpha=0.7, label='예측값')
    plt.xlabel('샘플')
    plt.ylabel('값')
    plt.title('예측값 vs 실제값 비교')
    plt.legend()
    plt.show()


# Entry point
def main():
    """Run the complete example pipeline on synthetic data."""
    # Example data (load from a file in a real project)
    np.random.seed(42)
    n_samples = 1000

    # Generate synthetic data
    data = {
        'feature1': np.random.normal(0, 1, n_samples),
        'feature2': np.random.normal(0, 1, n_samples),
        'feature3': np.random.normal(0, 1, n_samples),
        'target': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    df = pd.DataFrame(data)

    # 1. Explore
    df = load_and_explore_data(df)

    # 2. Preprocess
    X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data(
        df, 'target'
    )

    # 3. Train and evaluate
    model, y_pred = train_and_evaluate_model(
        X_train_scaled, X_test_scaled, y_train, y_test
    )

    # 4. Visualise
    visualize_results(df, 'target', model, X_test_scaled, y_test, y_pred)

    print("=== 프로젝트 완료! ===")


if __name__ == "__main__":
    main()
```

---

## 7. 자주 사용하는 코드 패턴

### 7.1 데이터 로딩 및 저장

```python
# Reading files in various formats
df_csv = pd.read_csv('data.csv', encoding='utf-8')
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df_json = pd.read_json('data.json')

# Saving data
df.to_csv('output.csv', index=False, encoding='utf-8')
df.to_excel('output.xlsx', index=False)
df.to_json('output.json', orient='records')
```

### 7.2 데이터 정제

```python
# Remove duplicates
df_clean = df.drop_duplicates()

# Normalise column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Convert data types
df['날짜'] = pd.to_datetime(df['날짜'])
df['카테고리'] = df['카테고리'].astype('category')

# Conditional transformation
df['급여등급'] = np.where(df['급여'] > 3500, '고급', '일반')
```

### 7.3 시계열 데이터 처리

```python
# Use the date column as index
df['날짜'] = pd.to_datetime(df['날짜'])
df.set_index('날짜', inplace=True)

# Group by time period. numeric_only=True skips text/category columns, which
# cannot be averaged or summed.
monthly_data = df.resample('M').mean(numeric_only=True)  # pandas >= 2.2 spells month-end 'ME'
daily_data = df.resample('D').sum(numeric_only=True)

# Moving average
df['이동평균_7일'] = df['값'].rolling(window=7).mean()
```

### 7.4 병렬 처리

```python
import math
from multiprocessing import Pool
import pandas as pd


def process_chunk(chunk):
    """Apply the processing function to one chunk and return the result."""
    # some_processing_function is a placeholder: define your own function first.
    return chunk.apply(some_processing_function)


def parallel_process_data(df, n_processes=4):
    """Process a DataFrame in parallel, one chunk per worker.

    Args:
        df: DataFrame to process.
        n_processes: Number of worker processes (and maximum number of chunks).

    Returns:
        The processed chunks concatenated into one DataFrame with a fresh index.
        An empty input is returned as an (unprocessed) empty copy.
    """
    if df.empty:
        # Nothing to distribute; pd.concat of an empty list would raise ValueError.
        return df.copy()

    # Ceiling division keeps the step at >= 1 even when len(df) < n_processes
    # and never produces more than n_processes chunks.
    chunk_size = max(1, math.ceil(len(df) / n_processes))
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Parallel processing
    with Pool(n_processes) as pool:
        results = pool.map(process_chunk, chunks)

    # Combine the results
    return pd.concat(results, ignore_index=True)
```

---

## 8. 문제 해결 가이드

### 8.1 자주 발생하는 오류와 해결방법

#### 8.1.1 인덱싱 오류

```python
# Error: KeyError: 'column_name'
# Fix: check that the column exists
if 'column_name' in df.columns:
    result = df['column_name']
else:
    print("컬럼이 존재하지 않습니다.")

# Error: IndexError: single positional indexer is out-of-bounds
# Fix: check the index range
if len(df) > 0:
    first_row = df.iloc[0]
else:
    print("데이터프레임이 비어있습니다.")
```

#### 8.1.2 데이터 타입 오류

```python
# Error: TypeError: can't multiply sequence by non-int of type 'float'
# Fix: convert the data type (unparseable values become NaN)
df['수치컬럼'] = pd.to_numeric(df['수치컬럼'], errors='coerce')

# Error: ValueError: cannot convert float NaN to integer
# Fix: handle missing values before converting
df['정수컬럼'] = df['정수컬럼'].fillna(0).astype(int)
```

#### 8.1.3 메모리 오류

```python
# Saving memory when working with large data
# 1. Optimise data types (make sure the values fit: int8 only holds -128..127)
df['작은정수'] = df['작은정수'].astype('int8')
df['작은실수'] = df['작은실수'].astype('float32')

# 2. Process in chunks
chunk_size = 10000
for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    process_chunk(chunk)
```

### 8.2 성능 최적화 팁

```python
# 1. Use vectorised operations instead of loops
# Slow
for i in range(len(df)):
    df.loc[i, '새컬럼'] = df.loc[i, '컬럼1'] + df.loc[i, '컬럼2']

# Fast
df['새컬럼'] = df['컬럼1'] + df['컬럼2']

# 2. Use appropriate data types
df['카테고리'] = df['카테고리'].astype('category')  # saves memory

# 3. Drop columns that are not needed
df = df.drop(['사용하지않는컬럼1', '사용하지않는컬럼2'], axis=1)

# 4. Optimise the index
df.set_index('자주사용하는컬럼', inplace=True)
```

### 8.3 디버깅 팁

```python
# 1. Check the state of the data
print(f"데이터프레임 크기: {df.shape}")
print(f"메모리 사용량: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. Check intermediate results
print("전처리 후 데이터:")
print(df.head())
print(f"결측값: {df.isnull().sum()}")

# 3. Exception handling
try:
    result = some_function(df)
except Exception as e:
    print(f"오류 발생: {e}")
    # df.info() prints its report and returns None, so it must not be embedded
    # in an f-string (that would print "None").
    print("데이터프레임 정보:")
    df.info()
    raise
```

---

## 📚 추가 학습 자료

### 온라인 리소스

- **Python 공식 문서**: https://docs.python.org/
- **NumPy 공식 문서**: https://numpy.org/doc/
- **Pandas 공식 문서**: https://pandas.pydata.org/docs/
- **Scikit-learn 공식 문서**: https://scikit-learn.org/stable/
- **Seaborn 공식 문서**: https://seaborn.pydata.org/

### 추천 도서

- "파이썬 데이터 사이언스 핸드북" - Jake VanderPlas
- "파이썬 머신러닝 완벽 가이드" - 권철민
- "데이터 분석을 위한 파이썬 라이브러리" - Wes McKinney

### 실습 프로젝트 아이디어

1. **부동산 가격 예측**: 지역, 면적, 연도 등으로 가격 예측
2. **고객 이탈 예측**: 고객 행동 데이터로 이탈 가능성 예측
3. **제품 추천 시스템**: 사용자 행동 데이터로 제품 추천
4. **감정 분석**: 텍스트 데이터로 감정 분류
5. **이상치 탐지**: 금융 거래 데이터에서 이상 거래 탐지

---

## 🎯 마무리

이 문서는 지금까지 배운 AI 교육 내용의 핵심을 정리한 것입니다. 1달 후에도 이 문서를 보면 주요 개념과 코드 패턴을 쉽게 떠올릴 수 있을 것입니다.

**💡 기억하세요:**

- **Python**: 프로그래밍의 기초, 모든 것의 시작
- **NumPy**: 수치 계산의 핵심, 선형대수의 기초
- **Pandas**: 데이터 분석의 도구, 데이터 전처리의 필수
- **Scikit-learn**: 머신러닝의 실전, 모델링의 핵심
- **Seaborn**: 데이터 시각화의 예술, 인사이트 발견의 도구

**🚀 다음 단계:**

- 이 문서의 예제들을 직접 실행해보세요
- 자신만의 프로젝트를 시작해보세요
- 새로운 라이브러리나 기법을 탐험해보세요

**💪 꾸준한 연습이 실력을 만듭니다!**

---

*이 문서는 AI 교육 과정의 종합 정리서입니다. 궁금한 점이나 추가로 필요한 내용이 있으면 언제든 문의해주세요.*