# 모듈 import 하기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import randn
# Use the Malgun Gothic font so Korean labels are not rendered as broken glyphs.
plt.rcParams['font.family'] = 'malgun gothic'
# Keep the minus sign on negative numbers from rendering as a broken glyph.
plt.rc('axes', unicode_minus=False)


# Line plot over string x labels with explicit axis bounds and y ticks.
x = ['서울', '인천', '수원']  # a list or a tuple both work here
y = [5, 3, 7]
plt.xlim((-1, 3))  # axis bounds
plt.ylim((0, 10))
plt.yticks([0, 3, 6, 9])  # explicit tick positions: 0 to 10 in steps of 3
plt.plot(x, y)
plt.show()


# Draw two lines from the same values and label every point with its value.
data = np.arange(1, 11, 2)
x = list(range(5))

# Labels for the first line, which is plotted against the positions 0..4.
for position, value in zip(x, data):
    plt.text(position, value, str(value))

pure_red = (1.0, 0.0, 0.0)  # RGB tuple
plt.plot(data)
plt.plot(data, data, color=pure_red)

# Labels for the second line, where x and y are the same values.
for value in data:
    plt.text(value, value, str(value))

plt.show()


# Two ways to style a line: a format string, and explicit keyword arguments.
x = np.arange(10)
y = np.sin(x)
plt.plot(x, y, 'go:')  # format string; ':' makes the line dotted
# Same curve again, styled with the spelled-out forms of c / ms / ls / lw.
plt.plot(
    x,
    y,
    color='g',
    marker='o',
    markersize=15,
    linestyle='--',
    linewidth=5,
)
plt.show()


# Several plots drawn inside one Figure.
x = np.arange(0, 3 * np.pi, 0.1)
y_sin, y_cos = np.sin(x), np.cos(x)

plt.figure(figsize=(10, 5))
plt.plot(x, y_sin, color='r')
plt.scatter(x, y_cos)

plt.title('사인 코사인 그래프')
plt.xlabel('x축')
plt.ylabel('y축')
legend_labels = ['사인', '코사인']
plt.legend(legend_labels)  # attach a label to each plotted series

plt.show()


# Two stacked subplots in a 2-row, 1-column grid; each pass activates the
# next grid position and draws one curve with its title.
panels = ((y_sin, 'Sine'), (y_cos, 'Cosine'))
for position, (curve, heading) in enumerate(panels, start=1):
    plt.subplot(2, 1, position)
    plt.plot(x, curve)
    plt.title(heading)
plt.show()


# fig = plt.gcf()  # 차트를 이미지로 저장 준비
# plt.show()
# fig.savefig('test.png')


# Create the Figure object explicitly and add two side-by-side Axes to it.
fig = plt.figure()
ax1, ax2 = (fig.add_subplot(1, 2, slot) for slot in (1, 2))  # 1 row, 2 columns

hist_sample = randn(10)
ax1.hist(hist_sample)  # histogram
line_sample = randn(10)
ax2.plot(line_sample)  # line chart
plt.show()


# Vertical bar chart; one of the values is negative.
data = [50, 80, 100, -70, 90]
plt.bar(range(len(data)), data)
plt.show()

# Horizontal bar chart with error bars.
data = [50, 80, 100, 70, 90]
# Error-bar sizes are magnitudes. randn() also yields negative numbers, which
# matplotlib rejects as error values, so take the absolute value.
error = np.abs(randn(len(data)))
plt.barh(range(len(data)), data, alpha=0.3, xerr=error)
plt.show()


# Pie chart: the second and fourth wedges are pulled out from the centre.
data = [50, 80, 100, 70, 90]
wedge_offsets = (0, 0.1, 0, 0.3, 0)
palette = ['yellow', 'blue', 'red']  # three colours given for five wedges
plt.pie(x=data, explode=wedge_offsets, colors=palette)
plt.show()


# Box plot of the same values.
plt.boxplot(x=data)
plt.show()


# Scatter plot of seeded random points with per-point colour and marker area.
n = 30
np.random.seed(0)
# Draw x, y and colour in this order so the seeded sequence is consumed the
# same way on every run.
x, y, color = (np.random.rand(n) for _ in range(3))
radius = 15 * np.random.rand(n)
scale = np.pi * radius ** 2
plt.scatter(x, y, s=scale, c=color)
plt.show()


# Plot several series together: four cumulative random walks over 1000 dates.
steps = np.random.randn(1000, 4)
dates = pd.date_range('1/1/2000', periods=1000)
fdata = pd.DataFrame(steps, index=dates, columns=['a', 'b', 'c', 'd']).cumsum()
print(fdata)
plt.plot(fdata)
plt.show()

                    a         b          c          d
2000-01-01   0.010500  1.785870   0.126912   0.401989
2000-01-02   1.893651  0.438111  -1.143573   1.371386
2000-01-03   0.720527  2.381733  -1.557192   0.623931
2000-01-04   2.643469  3.862247   0.310367   1.529976
2000-01-05   1.782244  5.772312   0.042364   2.332432
...               ...       ...        ...        ...
2002-09-22 -68.100051 -1.757694 -28.022355 -11.203922
2002-09-23 -68.872649 -2.041320 -30.349959 -13.649150
2002-09-24 -69.588514 -1.157923 -31.650399 -13.725489
2002-09-25 -68.157957 -2.481331 -32.088755 -14.468641
2002-09-26 -67.265990 -2.017459 -31.471094 -11.972224

[1000 rows x 4 columns]


# Load the iris CSV from GitHub and show its first three rows.
iris_url = "https://raw.githubusercontent.com/pykwon/python/master/testdata_utf8/iris.csv"
iris_data = pd.read_csv(iris_url)
print(iris_data.head(3))

   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa


# Scatter plot with Sepal.Length as the independent variable (x) and
# Petal.Length as the dependent variable (y).
x_column, y_column = 'Sepal.Length', 'Petal.Length'
plt.scatter(iris_data[x_column], iris_data[y_column])
plt.title('iris data')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.show()


# Give setosa, versicolor and virginica each their own colour code.
species_codes = {'setosa': 1, 'versicolor': 2, 'virginica': 3}
# Any label that is not in the table falls back to code 0.
cols = [species_codes.get(species, 0) for species in iris_data['Species']]

plt.scatter(iris_data['Sepal.Length'], iris_data['Petal.Length'], c=cols)
plt.xlabel('Sepal.Length')
plt.ylabel('Petal.Length')
plt.title('iris data')
plt.show()


from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the columns from Sepal.Length through Petal.Width
# (label-based slice, both ends included).
iris_col = iris_data.loc[:, 'Sepal.Length':'Petal.Width']
scatter_matrix(frame=iris_col, diagonal='kde')  # the tutorial also notes 'hist' for the diagonal
plt.show()


# Same kind of pairwise view using seaborn, coloured by the Species column.
sns.pairplot(data=iris_data, hue='Species')
plt.show()


# Plotting straight from a pandas DataFrame.
import numpy as np

np.random.seed(0)
values = np.random.randn(10, 3)
days = pd.date_range('1/1/2000', periods=10)
df = pd.DataFrame(values, index=days, columns=['a', 'b', 'c'])
df.plot.bar(rot=45)  # rot=45 rotates the x tick labels by 45 degrees
plt.xlabel('time')
plt.ylabel('data')
plt.show()


# Bike-sharing data; the 'count' column holds the rental volume.
train_url = "https://raw.githubusercontent.com/pykwon/python/master/data/train.csv"
train = pd.read_csv(train_url, parse_dates=['datetime'])  # parse 'datetime' as timestamps
print(train.head())
print(train.columns)

             datetime  season  holiday  workingday  weather  temp   atemp  \
0 2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1 2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2 2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3 2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4 2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  casual  registered  count  
0        81        0.0       3          13     16  
1        80        0.0       8          32     40  
2        80        0.0       5          27     32  
3        75        0.0       3          10     13  
4        75        0.0       0           1      1  
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')


# Basic summary statistics for the temperature column.
temp_summary = train['temp'].describe()
print(temp_summary)

count    10886.00000
mean        20.23086
std          7.79159
min          0.82000
25%         13.94000
50%         20.50000
75%         26.24000
max         41.00000
Name: temp, dtype: float64


# Add one column per datetime component (year ... second) so statistics can
# be computed for each unit.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    train[part] = getattr(train['datetime'].dt, part)
print(train.columns)
print(train.head(2))

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'minute', 'second'],
      dtype='object')
             datetime  season  holiday  workingday  weather  temp   atemp  \
0 2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1 2011-01-01 01:00:00       1        0           0        1  9.02  13.635   

   humidity  windspeed  casual  registered  count  year  month  day  hour  \
0        81        0.0       3          13     16  2011      1    1     0   
1        80        0.0       8          32     40  2011      1    1     1   

   minute  second  
0       0       0  
1       0       0


# Visualise rental volume by year, month, day and hour, one bar plot per panel.
figure, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4)
figure.set_size_inches(15, 5)
for time_unit, panel in zip(('year', 'month', 'day', 'hour'), (ax1, ax2, ax3, ax4)):
    sns.barplot(data=train, x=time_unit, y='count', ax=panel)
ax1.set(ylabel='Count', title='연도별 대여량')
# The remaining panels set their x label and title.
for panel, x_label, heading in (
    (ax2, 'month', '월별 대여량'),
    (ax3, 'day', '일별 대여량'),
    (ax4, 'hour', '시간별 대여량'),
):
    panel.set(xlabel=x_label, title=heading)
plt.show()


# Box plots of rental volume: overall, then grouped by season, hour and
# working day, on a 2x2 grid.
figure, axes = plt.subplots(nrows=2, ncols=2)
figure.set_size_inches(12, 10)
overall_ax, season_ax, hour_ax, workingday_ax = axes.flat  # row-major order

sns.boxplot(data=train, y='count', orient='v', ax=overall_ax)
overall_ax_label = dict(ylabel='Count', title='대여량')
grouped_panels = (
    (season_ax, 'season', '계절별 대여량'),
    (hour_ax, 'hour', '시간별 대여량'),
    (workingday_ax, 'workingday', '근무일 여부에 따른 대여량'),
)
for panel, group_column, _ in grouped_panels:
    sns.boxplot(data=train, y='count', x=group_column, orient='v', ax=panel)

overall_ax.set(**overall_ax_label)
for panel, group_column, heading in grouped_panels:
    panel.set(xlabel=group_column, title=heading)
plt.show()


# Regression plots of rental count against temp, humidity and windspeed.
figure, (ax1, ax2, ax3) = plt.subplots(ncols=3)
figure.set_size_inches(10, 5)
for feature, panel in zip(('temp', 'humidity', 'windspeed'), (ax1, ax2, ax3)):
    sns.regplot(data=train, x=feature, y='count', ax=panel)
ax1.set(ylabel='Count', title='온도별 대여량')
for panel, x_label, heading in (
    (ax2, 'humidity', '습도별 대여량'),
    (ax3, 'windspeed', '풍속별 대여량'),
):
    panel.set(xlabel=x_label, title=heading)
plt.show()

Visualization [KOR]

시각화

선 그래프

다양한 종류의 그래프 - 적절한 그래프 선정이 중요

iris 데이터 셋으로 시각화 해보기 - matplotlib, seaborn 사용

자전거 대여 정보를 이용해 시각화