Friday, July 1, 2022
Python Script for Data Recon
from collections import OrderedDict as od
import pandas as pd
def diff_func(df_left, df_right, uid, labels=('Left', 'Right'), drop=[[],[]]):
dict_df = {labels[0]: df_left, labels[1]: df_right}
col_left = df_left.columns.tolist()
col_right = df_right.columns.tolist()
# There could be columns known to be different, hence allow user to pass this as a list to be dropped.
if drop[0] or drop[1]:
print ('{}: Ignoring columns {} in comparison.'.format(labels[0], ', '.join(drop[0])))
print ('{}: Ignoring columns {} in comparison.'.format(labels[1], ', '.join(drop[1])))
col_left = list(filter(lambda x: x not in drop[0], col_left))
col_right = list(filter(lambda x: x not in drop[1], col_right))
df_left = df_left[col_left]
df_right = df_right[col_right]
# Step 1 - Check if no. of columns are the same:
len_lr = len(col_left), len(col_right)
assert len_lr[0]==len_lr[1], \
'Cannot compare frames with different number of columns: {}.'.format(len_lr)
# Step 2a - Check if the set of column headers are the same
# (order doesnt matter)
assert set(col_left)==set(col_right), \
'Left column headers are different from right column headers.' \
+'\n Left orphans: {}'.format(list(set(col_left)-set(col_right))) \
+'\n Right orphans: {}'.format(list(set(col_right)-set(col_left)))
# Step 2b - Check if the column headers are in the same order
if col_left != col_right:
print ('[Note] Reordering right Dataframe...')
df_right = df_right[col_left]
# Step 3 - Check datatype are the same [Order is important]
if all(df_left.dtypes == df_right.dtypes):
print ('DataType check: Passed')
else:
print ('dtypes are not the same.')
df_dtypes = pd.DataFrame({labels[0]:df_left.dtypes,labels[1]:df_right.dtypes,'Diff':(df_left.dtypes == df_right.dtypes)})
df_dtypes = df_dtypes[df_dtypes['Diff']==False][[labels[0],labels[1],'Diff']]
print (df_dtypes)
# Step 4 - Check for duplicate rows
for key, df in dict_df.items():
if df.shape[0] != df.drop_duplicates().shape[0]:
print(key + ': Duplicates exists, they will be dropped.')
dict_df[key] = df.drop_duplicates()
# Step 5 - Check for duplicate uids.
if isinstance(uid, (str, list)):
print ('Uniqueness check: {}'.format(uid))
for key, df in dict_df.items():
count_uid = df.shape[0]
count_uid_unique = df[uid].drop_duplicates().shape[0]
dp = [0,1][count_uid_unique == df.shape[0]] #<-- Round off to the nearest integer if it is 100%
pct = round(100*count_uid_unique/df.shape[0], dp)
print ('{}: {} out of {} are unique ({}%).'.format(key, count_uid_unique, count_uid, pct))
# Checks complete, begin merge.
d_result = od()
d_result[labels[0]] = df_left
d_result[labels[1]] = df_right
if all(df_left.eq(df_right).all()):
print('Trival case: DataFrames are an exact match.')
d_result['Merge'] = df_left.copy()
else:
df_merge = pd.merge(df_left, df_right, on=col_left, how='inner')
if not df_merge.shape[0]:
print('Trival case: Merged DataFrame is empty')
d_result['Merge'] = df_merge
if type(uid)==str:
uid = [uid]
if type(uid)==list:
df_left_only = df_left.append(df_merge).reset_index(drop=True)
df_left_only['Duplicated']=df_left_only.duplicated(keep=False) #keep=False, marks all duplicates as True
df_left_only = df_left_only[~df_left_only['Duplicated']]
df_right_only = df_right.append(df_merge).reset_index(drop=True)
df_right_only['Duplicated']=df_right_only.duplicated(keep=False)
df_right_only = df_right_only[~df_right_only['Duplicated']]
label = '{} or {}'.format(*labels)
df_lc = df_left_only.copy()
df_lc[label] = labels[0]
df_rc = df_right_only.copy()
df_rc[label] = labels[1]
df_c = df_lc.append(df_rc).reset_index(drop=True)
df_c['Duplicated'] = df_c.duplicated(subset=uid, keep=False)
df_c1 = df_c[df_c['Duplicated']]
df_c1 = df_c1.drop('Duplicated', axis=1)
cols = df_c1.columns.tolist()
df_c1 = df_c1[[cols[-1]]+cols[:-1]]
df_uc = df_c[~df_c['Duplicated']]
df_uc_left = df_uc[df_uc[label]==labels[0]]
df_uc_right = df_uc[df_uc[label]==labels[1]]
d_result[labels[0]+'_only'] = df_uc_left.drop(['Duplicated', label], axis=1)
d_result[labels[1]+'_only'] = df_uc_right.drop(['Duplicated', label], axis=1)
d_result['Diff'] = df_c1.sort_values(uid).reset_index(drop=True)
return d_result
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment