与SQL比较
常用方法
### SELECT tips[['total_bill', 'tip', 'smoker', 'time']].head(5) ### WHERE tips[tips['time'] == 'Dinner'].head(5) tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)] frame[frame['col2'].isna()] frame[frame['col1'].notna()] ### GROUP BY tips.groupby('sex')['total_bill'].count() tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) ### JOIN #### INNER JOIN pd.merge(df1, df2, on='key') #### LEFT OUTER JOIN pd.merge(df1, df2, on='key', how='left') #### RIGHT JOIN pd.merge(df1, df2, on='key', how='right') #### FULL JOIN pd.merge(df1, df2, on='key', how='outer') ### UNION pd.concat([df1, df2]) pd.concat([df1, df2]).drop_duplicates() ### 更新(UPDATE) tips.loc[tips['tip'] < 2, 'tip'] *= 2 ### 删除(DELETE) tips = tips.loc[tips['tip'] <= 9]
Pandas等同于某些SQL分析和聚合函数
每组前N行
-- Oracle's ROW_NUMBER() analytic function SELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t ) WHERE rn < 3 ORDER BY day, rn;
In [35]: (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False) ....: .groupby(['day']) ....: .cumcount() + 1) ....: .query('rn < 3') ....: .sort_values(['day', 'rn'])) ....: Out[35]: total_bill tip sex smoker day time size rn 95 40.17 4.73 Male Yes Fri Dinner 4 1 90 28.97 3.00 Male Yes Fri Dinner 2 2 170 50.81 10.00 Male Yes Sat Dinner 3 1 212 48.33 9.00 Male No Sat Dinner 4 2 156 48.17 5.00 Male No Sun Dinner 6 1 182 45.35 3.50 Male Yes Sun Dinner 3 2 197 43.11 5.00 Female Yes Thur Lunch 4 1 142 41.19 5.00 Male No Thur Lunch 5 2
同样使用 rank (method ='first') 函数
In [36]: (tips.assign(rnk=tips.groupby(['day'])['total_bill'] ....: .rank(method='first', ascending=False)) ....: .query('rnk < 3') ....: .sort_values(['day', 'rnk'])) ....: Out[36]: total_bill tip sex smoker day time size rnk 95 40.17 4.73 Male Yes Fri Dinner 4 1.0 90 28.97 3.00 Male Yes Fri Dinner 2 2.0 170 50.81 10.00 Male Yes Sat Dinner 3 1.0 212 48.33 9.00 Male No Sat Dinner 4 2.0 156 48.17 5.00 Male No Sun Dinner 6 1.0 182 45.35 3.50 Male Yes Sun Dinner 3 2.0 197 43.11 5.00 Female Yes Thur Lunch 4 1.0 142 41.19 5.00 Male No Thur Lunch 5 2.0
-- Oracle's RANK() analytic function SELECT * FROM ( SELECT t.*, RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk FROM tips t WHERE tip < 2 ) WHERE rnk < 3 ORDER BY sex, rnk;
让我们找到每个性别组(等级<3)的提示(提示<2)。请注意,使用rank(method='min')
函数时 rnk_min对于相同的提示保持不变 (如Oracle的RANK()函数)
In [37]: (tips[tips['tip'] < 2] ....: .assign(rnk_min=tips.groupby(['sex'])['tip'] ....: .rank(method='min')) ....: .query('rnk_min < 3') ....: .sort_values(['sex', 'rnk_min'])) ....: Out[37]: total_bill tip sex smoker day time size rnk_min 67 3.07 1.00 Female Yes Sat Dinner 1 1.0 92 5.75 1.00 Female Yes Fri Dinner 2 1.0 111 7.25 1.00 Female No Sat Dinner 1 1.0 236 12.60 1.00 Male Yes Sat Dinner 2 1.0 237 32.83 1.17 Male Yes Sat Dinner 2 2.0