python pandas

July 27, 2019 4 minute read

Pandas
- easy to visualize
- easy to present
- easy to manipulate data as SQL
- High-performance containers for data analysis

Store Data in DataFrame

method 1

 import pandas as pd
 >>> df = pd.DataFrame({})
 >>> df = pd.DataFrame({'name': ['yuting', 'jako'], 'births':[987, 456]})
 >>> df
      name  births
 0  yuting     987
 1    jako     456

method2

 >>> names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel', 'Jim']
 >>> births = [968, 155, 77, 578, 973, 968 ]
 >>> BabyDataSet = list(zip(names, births))
 >>> df = pd.DataFrame(BabyDataSet, columns = ['names', 'births'])
 >>> df
      names  births
 0      Bob     968
 1  Jessica     155
 2     Mary      77
 3     John     578
 4      Mel     973
 5      Jim     968

Work with CSV

check point of CSV file
- header
- row index
- encoding

 import pandas as pd
 df = pd.read_csv('xxx.csv', )
 df = pd.read_csv('xxx.csv', names=['xxxx', 'xxxx'] ) #  加上　header

 # pandas 把 csv file　第一row 當header
 >>> df = pd.read_csv('test.csv', )
 >>> df
                                                  WN  07/01/2015    0103  N8607M  MDW  ... 0000.1 0000.2  0000.3  0000.4  Unnamed: 17
 0                                                WN  07/01/2015   144.0  N8609A  SAN  ...    0.0    0.0     0.0     0.0          NaN
 1                                                WN  07/01/2015   178.0  N646SW  ELP  ...    0.0    0.0     0.0     0.0          NaN
 2                                                WN  07/01/2015   232.0  N204WN  ATL  ...    0.0    0.0     0.0     0.0          NaN
   
 # csv file 沒有　header 的時候 處理方法
 >>> df = pd.read_csv('test.csv', header=None)
 >>> df
                                                  0           1       2       3    4      5   ...    12   13    14   15     16  17
 0                                                WN  07/01/2015   103.0  N8607M  MDW  06:30  ...   0.0  0.0   0.0  0.0    0.0 NaN
 1                                                WN  07/01/2015   144.0  N8609A  SAN  20:55  ...   0.0  0.0   0.0  0.0    0.0 NaN

work with CSV - save to csv

 import pandas as pd
 df = pd.read_cav('xxx.csv',)
 df.to_csv('save_csv.csv') # (default) index = True

Partial Data

 df.head(n=5) # top n rows
 df.tail(n=5) # last n rows
 df[1:5] # 1 to 4 rows
 df.sample(5) # random pick n rows

Make a DataSet

 import numpy as np
 import pandas as pd
   
 np.random.seed(111)
   
 def CreateDataSet(Number=1):
     Output=[]
     for i in range(Number):
         rng = pd.date_range(start='07/20/2019', end='12/31/2019', freq='W-MON')
           
         data = np.random.randint(low=25, high=1000, size=len(rng))
           
         status = [1, 2, 3]
           
         random_status = [status[np.random.randint(low=0, high=len(status))] for i in range(len(rng))]
           
         states = ['GA', 'FL', 'f1', 'NY', 'NJ', 'TX']
           
         random_states = [states[np.random.randint(low=0, high=len(states))] for i in range(len(rng))]
           
         Output.extend(zip(random_states, random_status, data, rng))
       
     return Output

show result

Data Observation
- Index and Select Data
  - Square brackets [ ]
  - Advanced methods
    - loc
    - iloc
  - Column Access [ ]
    - Series: df['CustomerCount'].head(5)
    - DataFrame: df[['CustomerCount']].head(5)
Slicing with Pandas
- Pandas slicking
  - label-based
    DataFrame.loc
  - Integer position-based
    DataFrame.iloc
- loc
- iloc
Conditional Data

get values

 df['Status'].values
 array([1, 3, 1, 3, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 3, 1, 2, 1, 2, 1, 3, 1,
     2, 3, 2, 2, 3, 3, 1, 1, 2, 1, 2, 2, 2, 1, 3, 3, 2, 3, 1, 3, 1, 2,
     2, 3, 1, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 1, 2, 2, 1, 3, 1, 2,
     1, 3, 1, 2, 1, 2, 3, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 2, 3, 3, 1, 1,
     1, 3, 3, 2, 2, 2, 3, 3])

 # basic numerical operation
 df['Status'].values + 1
 array([2, 4, 2, 4, 3, 2, 4, 2, 3, 4, 2, 3, 3, 3, 4, 2, 3, 2, 3, 2, 4, 2,
     3, 4, 3, 3, 4, 4, 2, 2, 3, 2, 3, 3, 3, 2, 4, 4, 3, 4, 2, 4, 2, 3,
     3, 4, 2, 3, 4, 4, 3, 2, 4, 3, 2, 4, 4, 2, 3, 2, 3, 3, 2, 4, 2, 3,
     2, 4, 2, 3, 2, 3, 4, 2, 2, 2, 2, 3, 2, 2, 2, 4, 4, 3, 4, 4, 2, 2,
     2, 4, 4, 3, 3, 3, 4, 4])

Generate a vector by function

 # apply func to each column
 print(df.apply(len, axis=0))

 State            96
 Status           96
 CustomerCount    96
 StatusDate       96
 dtype: int64

 # apply func to each row
 0     4
 1     4
 2     4
 3     4
 4     4
 .     .
 .     .
 .     .

 # example

 def text(x):
    status = x['Status']
    return 'correct' if status == 1 else 'error'

 df['StatusText'] = df.apply(text, axis=1)

Imgur

group
```
 g_state = df.groupby)(['State'])
 g_state.size()
```
- 合併成一個Object
- get group data
```
 g_state.get_group('NJ')
```
- sum all the numerical data
```
 g_state.sum()
```
- group by multiple indexes and hierarchical
Combine DataFrame
- pd.concat([df1, df2])
- When the row and column do not fully match, we have to consider the strategy to merge Dataframe
  - LEFT join
    - pd.merge(left, right, on=['key1', 'key2'], how='left')
    - 用 left 合併另一邊有可能產生　NaN
  - RIGHT join
    - pd.merge(left, right, on=['key1', 'key2'], how='right')
    - 用 right 合併另一邊有可能產生　NaN
  - OUTER join
    - pd.merge(left, right, on=['key1', 'key2'], how='outer')
    - 聯集合併，不論哪邊有NaN，都並在一起
  - INNER join
    - pd.merge(left, right, on=['key1', 'key2'], how='outer')
    - 找兩邊都有的，所以不會有Nana

Example iris:

import

  import matplotlib.pyplot as plt
  import numpy as np
  from sklearn import datasets
  import pandas as pd
      
  iris = datasets.load_iris()

Imgur

load pandas

  iris_DF = pd.DataFrame(iris.data, columns = iris.feature_names)
      
  iris_DF["species"] = "temp"
  iris_DF.loc[:49, 'species'] = "setosa"
  iris_DF.loc[50:99, 'species'] = "versicolor"
  iris_DF.loc[100:149, 'species'] = "virginica"
      
  iris_DF.head()

Imgur

groupby
EDA
```
 iris_DF.iloc[:,1].describe()
```
- show box plot

Box Plot

 kind = iris_DF.columns
 iris_DF[kind].count()
 # sepal length (cm)    150
 # sepal width (cm)     150
 # petal length (cm)    150
 # petal width (cm)     150
 # species              150
 # dtype: int64

Imgur

加入百分位數
show plot

Scatter Plot

 iris_DF.plot(x = "petal length (cm)", y = "petal width (cm)", kind= "scatter")

Imgur

Pandas plot API
- iris.plot(kind=’hinst’)
- iris.plt.hist()
- iris.hist(), etc …
- Check Pandas API

EDA all data

 indices = iris_DF['species'] == 'setosa'
 setosa = iris_DF.loc[indices, :] # extract new DataFrame
   
 indices = iris_DF['species'] == 'versicolor'
 versicolor = iris_DF.loc[indices, :] # extract new DataFrame
   
 indices = iris_DF['species'] == 'virginica'
 virginica = iris_DF.loc[indices, :] # extract new DataFrame

 #                                              plpha 透明度　　條與條之間的顏色
 iris_DF.plot(kind= "hist", bins=50, range=(0, 8), alpha=0.3, edgecolor="black")
 plt.title('Entire iris data set')
 plt.xlabel('[cm]')
 #          圖片大小　　出圖的時候要用多少dpi處理 
 plt.figure(figsize=(18, 12), dpi=600)
 plt.show()

Imgur

只看 setosa

 setosa.plot(kind= "hist", bins=50, range=(0, 8), alpha=0.3, edgecolor="black")
 plt.title('Entire iris data set')
 plt.xlabel('[cm]')
 plt.figure(figsize=(18, 12), dpi=600)
 plt.show()

Imgur

Twitter Facebook LinkedIn

阿葛廷

python pandas

You May Also Enjoy

RTSP 攝影機參數最完整科普（工程師 × 商用系統）

一次 QPS 風暴的復盤：從 API 變慢到建立防禦層

有一點職業倦怠啊！

再不說些話我都快被ＡＩ淹沒拉！！！！