[[https://pandas.pydata.org/|Pandas]] is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.
import numpy as np
import pandas as pd
path = '/path/to/file(s)'
df = pd.read_csv(path + 'name.csv', sep=';', header=0, dtype={'Force String Column': str},
parse_dates=['Date Column'] encoding='utf-8')
| Variable | Meaning |
|---|---|
| sep | separator character, can be an array |
| header | row number that contains column titles. remove if none |
| dtype | force any column to be interpreted as specific datatype |
| parse_dates | parse any column as datetime |
| encoding | file encoding e.g. utf-8 or ansi |
dt.rename(columns={'File column name 1': 'Column1',
'File column name 2': '2'
}, inplace=True)
df.drop('Column A', axis=1, inplace=True)
df.loc[:, ['X']] = df.loc[:, ['X']].apply(lambda x: x.Y)
df = pd.concat([df1, df2])
df.reset_index(drop=True, inplace=True)
reset_index() is called to get rid of duplicate indices
df = df[df['A']str.match('regex')]
df = pd.merge(df1, df2, on='Column A', how='left', suffixes=('_1','_2'))
df = df.fillna(0) # To replace NaN fields with 0
df = df.sort_values('Column A', ascending = False)
pvt_df = pd.pivot_table(df,
values=['Column B', 'Column C'],
index='Column A',
aggfunc={'Column B': [np.sum, np.average],
'Column C' : lambda x: len(x)})
pvt_df .reset_index(inplace=True) # Make Column A a column again, instead of index
pvt_df .columns = [' '.join(col).strip() for col in
pvt_df.columns.values] # Flatten multi-index columns
pvt_df .rename(columns={ # Rename known numbers
'Column B average':'average_B',
'Column B sum':'sum_B',
'Column C <lambda>':'count_C'
}, inplace=True)
mask = df.loc[:,'Column X'].isnull()
df.loc[mask,['Column B', 'Column C']] = df.loc[mask,['Column A', 'Column B']].values