Pandas 学习笔记之统计篇#

一些准备工作#

[1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
# 辅助函数
def get_movie_df():
    """
    获得 movie dataframe
    """
    return pd.read_csv('datas/movies.csv', sep='\t', encoding='utf-8',thousands=',',escapechar='$')

def get_titanic_df():
    return pd.read_csv('datas/titanic.csv')

def get_iris_df():
    return pd.read_csv('datas/iris.csv')

def get_tips_df():
    return pd.read_csv('datas/tips.csv')

def get_random_df():
    return pd.DataFrame(
        np.random.randn(6, 4),
        index=pd.date_range('20200101', periods=6),
        columns=list('ABCD'))

总体描述#

[2]:
df=get_tips_df()
df.describe()  # 总体数据描述,只包括数值型数据
[2]:
total_bill tip size
count 244.000000 244.000000 244.000000
mean 19.785943 2.998279 2.569672
std 8.902412 1.383638 0.951100
min 3.070000 1.000000 1.000000
25% 13.347500 2.000000 2.000000
50% 17.795000 2.900000 2.000000
75% 24.127500 3.562500 3.000000
max 50.810000 10.000000 6.000000
[3]:
df.describe(include='all')  # 总体数据描述,包括所有类型数据
[3]:
total_bill tip sex smoker day time size
count 244.000000 244.000000 244 244 244 244 244.000000
unique NaN NaN 2 2 4 2 NaN
top NaN NaN Male No Sat Dinner NaN
freq NaN NaN 157 151 87 176 NaN
mean 19.785943 2.998279 NaN NaN NaN NaN 2.569672
std 8.902412 1.383638 NaN NaN NaN NaN 0.951100
min 3.070000 1.000000 NaN NaN NaN NaN 1.000000
25% 13.347500 2.000000 NaN NaN NaN NaN 2.000000
50% 17.795000 2.900000 NaN NaN NaN NaN 2.000000
75% 24.127500 3.562500 NaN NaN NaN NaN 3.000000
max 50.810000 10.000000 NaN NaN NaN NaN 6.000000
[4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object
 3   smoker      244 non-null    object
 4   day         244 non-null    object
 5   time        244 non-null    object
 6   size        244 non-null    int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB

计数#

DataFrame 的行数#

[5]:
len(df)
[5]:
244

分组计数 DataFrame.value_counts#

定义:

DataFrame.value_counts(
    self,
    subset: 'Sequence[Hashable] | None' = None,
    normalize: 'bool' = False,
    sort: 'bool' = True,
    ascending: 'bool' = False,
    dropna: 'bool' = True,
)

功能:

返回一个 Series , 其内容为 DataFrame 中唯一行的计数。

针对某两列分组计数#

[6]:
df[['sex', 'smoker']].value_counts()
[6]:
sex     smoker
Male    No        97
        Yes       60
Female  No        54
        Yes       33
dtype: int64
[7]:
df.value_counts(['sex', 'smoker'])
[7]:
sex     smoker
Male    No        97
        Yes       60
Female  No        54
        Yes       33
dtype: int64

分组计数显示比率并从小到大排列#

[8]:
df.value_counts(['size'], normalize=True, ascending=True)
[8]:
size
1       0.016393
6       0.016393
5       0.020492
4       0.151639
3       0.155738
2       0.639344
dtype: float64

分组计数默认不统计空值#

[9]:
df_t = get_titanic_df()
df_t.value_counts(['age'])
[9]:
age
24.00    30
22.00    27
18.00    26
30.00    25
28.00    25
         ..
20.50     1
14.50     1
12.00     1
0.92      1
80.00     1
Length: 88, dtype: int64
[10]:
df_t.value_counts(['age'], dropna=False)
[10]:
age
NaN      177
24.00     30
22.00     27
18.00     26
30.00     25
        ...
24.50      1
0.67       1
0.92       1
36.50      1
0.42       1
Length: 89, dtype: int64

分组计数 DataFrame.groupby#

[11]:
df.groupby('size').size()
[11]:
size
1      4
2    156
3     38
4     37
5      5
6      4
dtype: int64
[12]:
df.groupby('size').count()  # 因为age有空值,count不会统计空值行,所以数字会有所不同。
[12]:
total_bill tip sex smoker day time
size
1 4 4 4 4 4 4
2 156 156 156 156 156 156
3 38 38 38 38 38 38
4 37 37 37 37 37 37
5 5 5 5 5 5 5
6 4 4 4 4 4 4
[13]:
df.groupby('size')['smoker'].count()  # 因为age有空值,count不会统计空值行,所以数字会有所不同。
[13]:
size
1      4
2    156
3     38
4     37
5      5
6      4
Name: smoker, dtype: int64

透视表#

[14]:
# 按照日期和性别分别统计餐费的平均数
df.pivot_table('total_bill', 'day', 'sex', aggfunc='mean')
[14]:
sex Female Male
day
Fri 14.145556 19.857000
Sat 19.680357 20.802542
Sun 19.872222 21.887241
Thur 16.715312 18.714667

合计#

[15]:
# 统计金额
# 按行统计
df = get_tips_df()
df = df[['total_bill', 'tip']]
# 只针对某些列
# df["bill_tip"] = df[['total_bill', 'tip']].apply(lambda x:x.sum(),axis =1)
# df.loc["all"] = df[['total_bill', 'tip']].apply(lambda x:x.sum(),axis = 0)
# 按列统计
df["bill_tip"] = df.apply(lambda x:x.sum(),axis =1)
df.loc["all"] = df.apply(lambda x:x.sum(),axis = 0)
# 或者
# df.loc["all"] = df.sum()
df
[15]:
total_bill tip bill_tip
0 16.99 1.01 18.00
1 10.34 1.66 12.00
2 21.01 3.50 24.51
3 23.68 3.31 26.99
4 24.59 3.61 28.20
... ... ... ...
240 27.18 2.00 29.18
241 22.67 2.00 24.67
242 17.82 1.75 19.57
243 18.78 3.00 21.78
all 4827.77 731.58 5559.35

245 rows × 3 columns

统计常用函数#

[16]:
df = get_tips_df()
df = df[['total_bill', 'tip']]

合计#

[17]:
df.sum()
[17]:
total_bill    4827.77
tip            731.58
dtype: float64

平均#

[18]:
df.mean()
[18]:
total_bill    19.785943
tip            2.998279
dtype: float64

最大值最小值#

[19]:
df.max()
[19]:
total_bill    50.81
tip           10.00
dtype: float64
[20]:
df.min()
[20]:
total_bill    3.07
tip           1.00
dtype: float64

平均数#

[21]:
df.std()
[21]:
total_bill    8.902412
tip           1.383638
dtype: float64

中位数#

[22]:
df.median()
[22]:
total_bill    17.795
tip            2.900
dtype: float64

唯一值#

[23]:
df = get_tips_df()
df['day'].unique()
[23]:
array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)