open(get_path("locales.json"), "wt")asfp:dump(locales, fp, indent=2, sort_keys=True)print("==> Done generating locales file.")ex0_gen_locales()ex0_gen_soln(force=False)### END HIDDEN TESTSdefex0_random_value():fromrandomimportrandom, randint, choicefromnumpyimportnanfromproblem_utilsimportex0_random_date, ex0_random_stringoptions = [randint(-100, 100)# int, ex0_random_string(randint(1, 10))# string, ex0_random_date(),# date'',# implicit NaNnan# explicit NaN]returnchoice(options)defex0_get_locales(filename=get_path("locales.json")):fromjsonimportloadwithopen(filename, "rt")asfp:locales = load(fp)returnlocalesdefex0_gen_row(locales, num_dummies=0):fromdatetimeimportdatetimefromrandomimportchoice, random, randintfromnumpyimportnanfromproblem_utilsimportex0_random_datecountry = choice(list(locales.keys()))province = nanifrandom() <= 0.1elsechoice(locales[country])confirmed = 0ifrandom() <= 0.1elserandint(1, 100000)last_updated = ex0_random_date()ifnum_dummies:dummy_vals = tuple([ex0_random_value()for_inrange(num_dummies)])else:dummy_vals = ()
11/1/2020main2/pmt2-sample-solutions (1)/midterm2/problem18/problem18.html14/59return(country, province, confirmed, last_updated, *dummy_vals)defex0_gen_df():fromrandomimportrandint, randomfrompandasimportDataFramefromproblem_utilsimportex0_random_stringlocales = ex0_get_locales()# Generate random columns, which the student should ignorenum_dummy_cols = randint(1, 4)dummy_cols = []whilelen(dummy_cols) != num_dummy_cols:dummy_cols = list({ex0_random_string(5)for_inrange(num_dummy_cols)})# Generate a bunch of random rowsnum_trials = randint(10, 50)rows = [ex0_gen_row(locales, num_dummy_cols)for_inrange(num_trials)]# Remove any initial duplicatesrows = sorted(rows, key=lambdax: repr(x))rows_soln = [rows[0]]forrinrows[1:]:ifrepr(r) != repr(rows_soln[-1]):rows_soln.append(r)# Construct the solution tibblecols_in = ["Country/Region"ifrandom() < 0.75else"Country_Region","Province/State"ifrandom() < 0.75else"Province_State","Confirmed","Last Update"ifrandom() < 0.75else"Last_Update"]cols_out = ["Country/Region", "Province/State", "Confirmed", "Last Update"]df_soln = DataFrame(rows_soln, columns=cols_out + dummy_cols)[cols_out] \.rename(columns={"Last Update": "Timestamp"})# Generate a corresponding input tibblerows_in = []forrinrows_soln:s = list(r)ifs[2] == 0:s[2] = ''# NaN countsr_in = tuple(s)rows_in.append(r_in)ifrandom() <= 0.15:# Random duplicatesfor_inrange(randint(1, 4)):rows_in.append(r_in)df_in = DataFrame(rows_in, columns=cols_in + dummy_cols)returndf_in, df_solndefex0_split_df(df, max_splits=5):fromrandomimportrandintfromnumpyimportarange, sort, appendfromnumpy.randomimportshuffle, choice# Shuffle the rows
11/1/2020main2/pmt2-sample-solutions (1)/midterm2/problem18/problem18.html15/59df = df.sample(frac=1).reset_index(drop=True)# Split the rowsdf_split = []num_splits = min(randint(0, max_splits), len(df))ifnum_splits > 0:split_inds = sort(choice(arange(len(df)), size=num_splits, replace=False))ifsplit_inds[0] > 0:split_inds = append(0, split_inds)ifsplit_inds[-1] < len(df):split_inds = append(split_inds, len(df))fori, jin
Upload your study docs or become a
Course Hero member to access this document
Upload your study docs or become a
Course Hero member to access this document
End of preview. Want to read all 59 pages?
Upload your study docs or become a
Course Hero member to access this document
Term
Fall
Professor
DaKuang
Tags
Extract transform load