import os
import timeit
import statistics
import pandas as pd
import tables
# Benchmark setup: base name of the dataset (without extension); the
# expression strings in compress_list reference both `filename` and `df`
# as module globals.
filename = 'test_csv'
df = pd.read_csv(f'{filename}.csv')

# Quick sanity peek at the data before benchmarking.
print(df.sample(5))
print(df.shape)
print(df.dtypes)
# Benchmark configuration table: one dict per serialization method.
# Keys:
#   method_name     - label shown in the results table
#   save_format     - Python expression string (run via timeit with module
#                     globals) that writes `df` to disk
#   read_format     - expression string that reads the file back
#   suffix          - extension appended to `filename` when measuring size
#   method_describe - human-readable description (runtime string; shown
#                     verbatim in the results table)
compress_list=[
# CSV, index column included
{
'method_name':"CSV",'save_format':"df.to_csv(filename + '_csv.csv')",
'read_format':"pd.read_csv(filename + '_csv.csv')",'suffix':"_csv.csv",'method_describe':"有索引的CSV文件格式"
},
# CSV, index column dropped
{
'method_name':"CSV No Index",'save_format':"df.to_csv(filename + '_csv.csv', index=False)",
'read_format':"pd.read_csv(filename + '_csv.csv')",'suffix':"_csv.csv",'method_describe':"无索引的CSV文件格式"
},
# CSV without index, gzip-compressed
{
'method_name':"CSV No Index (GZIP)",'save_format':"df.to_csv(filename + '.gzip', index=False, compression='gzip')",
'read_format':"pd.read_csv(filename + '.gzip', compression='gzip')",'suffix':".gzip",'method_describe':"gzip压缩格式的无索引CSV"
},
# CSV without index, bz2-compressed
{
'method_name':"CSV No Index (BZ2)",'save_format':"df.to_csv(filename + '.bz2', index=False, compression='bz2')",
'read_format':"pd.read_csv(filename + '.bz2', compression='bz2')",'suffix':".bz2",'method_describe':"bz2压缩格式的无索引CSV"
},
# CSV without index, zip-compressed
{
'method_name':"CSV No Index (ZIP)",'save_format':"df.to_csv(filename + '.zip', index=False, compression='zip')",
'read_format':"pd.read_csv(filename + '.zip', compression='zip')",'suffix':".zip",'method_describe':"zip压缩格式的无索引CSV"
},
# CSV without index, xz-compressed
{
'method_name':"CSV No Index (XZ)",'save_format':"df.to_csv(filename + '.xz', index=False, compression='xz')",
'read_format':"pd.read_csv(filename + '.xz', compression='xz')",'suffix':".xz",'method_describe':"xz压缩格式的无索引CSV"
},
# JSON, uncompressed
{
'method_name':"JSON",'save_format':"df.to_json(filename + '.json')",
'read_format':"pd.read_json(filename + '.json')",'suffix':".json",'method_describe':"json序列化"
},
# JSON, gzip-compressed (note: file keeps the .json suffix)
{
'method_name':"JSON(GZIP)",'save_format':"df.to_json(filename + '.json', compression='gzip')",
'read_format':"pd.read_json(filename + '.json', compression='gzip')",'suffix':".json",'method_describe':"gzip压缩格式的json序列化"
},
# JSON, bz2-compressed
{
'method_name':"JSON(BZ2)",'save_format':"df.to_json(filename + '.json', compression='bz2')",
'read_format':"pd.read_json(filename + '.json', compression='bz2')",'suffix':".json",'method_describe':"bz2压缩格式的json序列化"
},
# JSON, zip-compressed
{
'method_name':"JSON(ZIP)",'save_format':"df.to_json(filename + '.json', compression='zip')",
'read_format':"pd.read_json(filename + '.json', compression='zip')",'suffix':".json",'method_describe':"zip压缩格式的json序列化"
},
# JSON, xz-compressed
{
'method_name':"JSON(XZ)",'save_format':"df.to_json(filename + '.json', compression='xz')",
'read_format':"pd.read_json(filename + '.json', compression='xz')",'suffix':".json",'method_describe':"xz压缩格式的json序列化"
},
# Pickle, uncompressed binary serialization
{
'method_name':"Pickle",'save_format':"df.to_pickle(filename + '.pkl')",
'read_format':"pd.read_pickle(filename + '.pkl')",'suffix':".pkl",'method_describe':"二进制序列化Pickle"
},
# Pickle, gzip-compressed
{
'method_name':"Pickle (GZIP)",'save_format':"df.to_pickle(filename + '.pkl', compression='gzip')",
'read_format':"pd.read_pickle(filename + '.pkl', compression='gzip')",'suffix':".pkl",'method_describe':"gzip压缩的序列化Pickle"
},
# Pickle, bz2-compressed
{
'method_name':"Pickle (BZ2)",'save_format':"df.to_pickle(filename + '.pkl', compression='bz2')",
'read_format':"pd.read_pickle(filename + '.pkl', compression='bz2')",'suffix':".pkl",'method_describe':"bz2压缩的序列化Pickle"
},
# Pickle, zip-compressed
{
'method_name':"Pickle (ZIP)",'save_format':"df.to_pickle(filename + '.pkl', compression='zip')",
'read_format':"pd.read_pickle(filename + '.pkl', compression='zip')",'suffix':".pkl",'method_describe':"zip压缩的序列化Pickle"
},
# Pickle, xz-compressed
{
'method_name':"Pickle (XZ)",'save_format':"df.to_pickle(filename + '.pkl', compression='xz')",
'read_format':"pd.read_pickle(filename + '.pkl', compression='xz')",'suffix':".pkl",'method_describe':"xz压缩的序列化Pickle"
},
# HDF5, complevel=0 (no compression)
{
'method_name':"HDF+不压缩",'save_format':"df.to_hdf(filename + '.h5', key='key', mode='w',complevel=0)",
'read_format':"pd.read_hdf(filename + '.h5', key='key', mode='r')",'suffix':".h5",'method_describe':"不压缩的HDF5格式"
},
# HDF5, complevel=3 (light compression)
{
'method_name':"HDF+浅压缩",'save_format':"df.to_hdf(filename + '.h5', key='key', mode='w',complevel=3)",
'read_format':"pd.read_hdf(filename + '.h5', key='key', mode='r')",'suffix':".h5",'method_describe':"3级压缩的HDF5格式"
},
# HDF5, complevel=6 (heavier compression)
{
'method_name':"HDF+深压缩",'save_format':"df.to_hdf(filename + '.h5', key='key', mode='w',complevel=6)",
'read_format':"pd.read_hdf(filename + '.h5', key='key', mode='r')",'suffix':".h5",'method_describe':"6级压缩的HDF5格式"
},
# HDF5, complevel=9 (maximum compression)
{
'method_name':"HDF+极限压缩",'save_format':"df.to_hdf(filename + '.h5', key='key', mode='w',complevel=9)",
'read_format':"pd.read_hdf(filename + '.h5', key='key', mode='r')",'suffix':".h5",'method_describe':"9级压缩的HDF5格式"
},
# Parquet via fastparquet engine, snappy compression
{
'method_name':"Parquet(snappy)",'save_format':"df.to_parquet(filename + '.parquet', engine='fastparquet', compression='snappy')",
'read_format':"pd.read_parquet(filename + '.parquet')",'suffix':".parquet",'method_describe':"snappy压缩的Parquet格式"
},
# Parquet via fastparquet engine, gzip compression
{
'method_name':"Parquet(gzip)",'save_format':"df.to_parquet(filename + '.parquet', engine='fastparquet', compression='gzip')",
'read_format':"pd.read_parquet(filename + '.parquet')",'suffix':".parquet",'method_describe':"gzip压缩的Parquet格式"
},
# Parquet via fastparquet engine, brotli compression
{
'method_name':"Parquet(brotli)",'save_format':"df.to_parquet(filename + '.parquet', engine='fastparquet', compression='brotli')",
'read_format':"pd.read_parquet(filename + '.parquet')",'suffix':".parquet",'method_describe':"brotli压缩的Parquet格式"
},
]
def average(l):
    """Return the arithmetic mean of *l*, rounded to 5 decimal places.

    Raises ZeroDivisionError if *l* is empty (same as the original).
    """
    # Use the built-in sum() instead of the original manual accumulation
    # loop, which also shadowed the builtin with a local named `sum`.
    return round(sum(l) / len(l), 5)
def compress_performance(df_results, compress_detail):
    """Benchmark one save/read method and append a summary row.

    Times the save and read expression strings with timeit (5 repeats of
    5 executions each), measures the written file's size in MiB, and
    returns a new DataFrame with one result row appended.

    Parameters
    ----------
    df_results : pd.DataFrame
        Accumulated results so far; not mutated.
    compress_detail : dict
        One entry of compress_list with keys 'method_name',
        'save_format', 'read_format', 'suffix', 'method_describe'.

    Returns
    -------
    pd.DataFrame
        df_results with the new benchmark row concatenated.
    """
    # ---saving---
    # Pass the expression string directly to timeit: the original wrapped
    # it in eval(), which added eval() overhead to every timed run and
    # relied on `compress_detail` happening to exist as a module global.
    # The statement still sees `df` and `filename` through globals().
    result_save = timeit.repeat(
        compress_detail['save_format'],
        repeat=5,
        number=5,
        globals=globals())
    # ---size of the file just written, in MiB---
    file_size = os.path.getsize(filename + compress_detail['suffix']) / 1024**2
    # ---loading---
    result_read = timeit.repeat(
        compress_detail['read_format'],
        repeat=5,
        number=5,
        globals=globals())
    # ---collect one result row---
    row = {
        'method_name': compress_detail['method_name'],
        'file_size': file_size,
        'write_time_mean': average(result_save),
        # BUG FIX: the original rounded the stdev to 0 decimal places,
        # which reported 0 for any sub-second timing; keep 5 decimals to
        # match the precision of the means.
        'write_time_std': round(statistics.stdev(result_save), 5),
        'read_time_mean': average(result_read),
        'read_time_std': round(statistics.stdev(result_read), 5),
        'method_describe': compress_detail['method_describe'],
    }
    # DataFrame.append was removed in pandas 2.x; concat is the supported way.
    return pd.concat([df_results, pd.DataFrame.from_records([row])], ignore_index=True)
# Empty accumulator carrying the result schema; compress_performance()
# appends one row per benchmarked method.
_result_columns = [
    'method_name', 'file_size', 'write_time_mean', 'write_time_std',
    'read_time_mean', 'read_time_std', 'method_describe',
]
compress_results = pd.DataFrame(columns=_result_columns)
# Run every benchmark configuration, accumulating one result row per
# method. NOTE: the loop variable must be named `compress_detail` —
# compress_performance's timed statements resolve it through globals().
for compress_detail in compress_list:
    print('start check compress_method: ' + compress_detail['method_name'])
    compress_results = compress_performance(compress_results, compress_detail)

# Render the collected results as a Markdown table.
df = compress_results.copy()
md_table = df.to_markdown()
print(md_table)