Source code for pyveg.src.analysis_preprocessing

"""
This module consists of methods to process downloaded GEE data. The starting
point is a json file written out at the end of the downloading step. This
module cleans, resamples, and reformats the data to make it ready for analysis.

"""

import json
import math
import os

import numpy as np
import pandas as pd
import ewstools

from statsmodels.nonparametric.smoothers_lowess import lowess

from pyveg.src.data_analysis_utils import write_to_json, cball_parfit, cball

from pyveg.src.date_utils import get_time_diff

from pyveg.src.file_utils import construct_filename_from_metadata

try:
    from pyveg.src.zenodo_utils import download_results_by_coord_id
except ImportError:
    print("Unable to import zenodo_utils")

try:
    from pyveg.src import azure_utils
except ImportError:
    print("Unable to import azure_utils")


def read_results_summary(input_location, input_filename="results_summary.json", input_location_type="local"):
    """
    Read the results_summary.json, either from local storage, Azure blob storage, or zenodo.

    Parameters
    ==========
    input_location: str, directory or container with results_summary.json in, or coords_id if reading from zenodo
    input_filename: str, name of json file, default is "results_summary.json"
    input_location_type: str: 'local' or 'azure' or 'zenodo' or 'zenodo_test'

    Returns
    =======
    json_data: dict, the contents of results_summary.json
    """
    if input_location_type == "local":
        json_filepath = os.path.join(input_location, input_filename)
        if not os.path.exists(json_filepath):
            raise FileNotFoundError("Unable to find {}".format(json_filepath))
        json_data = json.load(open(json_filepath))
        return json_data
    elif input_location_type == "zenodo" or input_location_type == "zenodo_test":
        use_sandbox = input_location_type == "zenodo_test"
        json_location = download_results_by_coord_id(input_location, "json", test=use_sandbox)
        if os.path.exists(json_location):
            json_data = json.load(open(json_location))
            return json_data
        else:
            print("unable to find {} in Zenodo".format(input_location))
            return {}
    elif input_location_type == "azure":
        subdirs = azure_utils.list_directory(input_location, input_location)
        print("Found subdirs {}".format(subdirs))
        for subdir in subdirs:
            print("looking at subdir {}".format(subdir))
            if "combine" in subdir:
                files = azure_utils.list_directory(input_location + "/" + subdir, input_location)
                if input_filename in files:
                    return azure_utils.read_json(
                        input_location + "/" + subdir + "/" + input_filename, input_location
                    )
                else:
                    raise RuntimeError("No {} found in {}".format(input_filename, subdir))
        return {}
    else:
        raise RuntimeError("input_location_type needs to be either 'local','azure', 'zenodo' or 'zenodo_test'")

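# Illustrative call patterns for read_results_summary (an assumed sketch, not
# from the original source); the azure/zenodo variants require the optional
# imports above to have succeeded.
#
#     >>> data = read_results_summary("/path/to/results_dir")                   # local
#     >>> data = read_results_summary("mycontainer", input_location_type="azure")
#     >>> data = read_results_summary("1234", input_location_type="zenodo")     # coords_id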
def read_json_to_dataframes(data):
    """
    Convert json data to a dict of DataFrame.

    Parameters
    ----------
    data : dict, json data output from run_pyveg_pipeline

    Returns
    ----------
    dict
        A dict of the saved results in a DataFrame format. Keys are
        names of collections and the values are DataFrame of results
        for that collection.
    """
    # start with empty output dataframes
    dfs = {}

    # loop over collections and make a DataFrame from the results of each
    for collection_name, coll_results in data.items():
        rows_list = []
        if "time-series-data" in coll_results.keys():

            # loop over time series
            for date, time_point in coll_results["time-series-data"].items():

                # check we have data for this time point
                if time_point is None or time_point == {} or time_point == []:
                    # add Null row if data is missing at this time point
                    rows_list.append({"date": date})

                # if we are looking at veg data, loop over space points
                elif isinstance(list(time_point)[0], dict):
                    for space_point in time_point:
                        # Scale NDVI values - in the image they will be between 0 and 255 to give a greyscale
                        # (8-bit) image, but the actual NDVI values are between -1 and 1
                        if 'ndvi' in space_point.keys():
                            space_point['ndvi'] = space_point['ndvi'] * (2.0 / 255.0) - 1
                        if 'ndvi_veg' in space_point.keys():
                            space_point['ndvi_veg'] = space_point['ndvi_veg'] * (2.0 / 255.0) - 1
                        rows_list.append(space_point)

                # otherwise, just add the row
                else:
                    # the key of each object in the time series is the date, and data
                    # for this date should be the values. Here we just add the date
                    # as a value to enable us to add the whole row in one go later.
                    time_point["date"] = date
                    rows_list.append(time_point)

            # make a DataFrame and add it to the dict of DataFrames
            df = pd.DataFrame(rows_list)
            df = df.drop(columns=["slope", "offset", "mean", "std"], errors="ignore")
            df = df.sort_values(by="date")
            assert df.empty == False
            dfs[collection_name] = df

    return dfs

def make_time_series(dfs):
    """
    Given a dictionary of DataFrames which may contain many rows per time point
    (corresponding to the network centrality values of different sub-locations),
    collapse this into a time series by calculating the mean and std of the
    different sub-locations at each date.

    Parameters
    ----------
    dfs : dict of DataFrame
        Input DataFrames read by `read_json_to_dataframes`.

    Returns
    ----------
    ts_list: list of DataFrames
        The time-series results averaged over sub-locations.
        First entry will be the main dataframe of vegetation and weather.
        Second one (if present) will be historical weather.
    """

    # the time series dataframe
    ts_df = pd.DataFrame(columns=["date"])

    veg_satellite_prefix = ""

    # loop over collections
    for col_name, df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # group by date to collapse all network centrality measurements
            groups = df.groupby("date")

            # get summaries
            means = groups.mean()
            stds = groups.std()

            # rename columns
            if "COPERNICUS/S2" in col_name:
                s = "S2_"
                veg_satellite_prefix = s
            elif "LANDSAT" in col_name:
                s = "L" + col_name.split("/")[1][-1] + "_"
            else:
                s = col_name + "_"
                veg_satellite_prefix = s
            means = means.rename(columns={c: s + c + "_mean" for c in means.columns})
            stds = stds.rename(columns={c: s + c + "_std" for c in stds.columns})

            # merge
            df = pd.merge(means, stds, on="date", how="inner")
            ts_df = pd.merge_ordered(ts_df, df, on="date", how="outer")

        # add climate data if available
        elif "ECMWF/ERA5/" in col_name:
            df = df.set_index("date")
            ts_df = pd.merge_ordered(ts_df, df, on="date", how="outer")

    # remove unneeded columns
    ts_df = ts_df.loc[:, ~ts_df.columns.str.contains("latitude_std", case=False)]
    ts_df = ts_df.loc[:, ~ts_df.columns.str.contains("longitude_std", case=False)]

    assert ts_df.empty == False

    ts_list = []
    # if there is a big (>10yr) gap between the start of veg and weather time-series,
    # we want to make a separate historic time-series.
    veg_col_name = [col for col in ts_df.columns if col.startswith(veg_satellite_prefix)][0]
    earliest_date = ts_df.iloc[0]["date"]
    earliest_veg_date = ts_df[ts_df[veg_col_name].notna()].iloc[0]["date"]
    if get_time_diff(earliest_veg_date, earliest_date) > 10:
        ts_df_historic = ts_df[ts_df["date"] < earliest_veg_date][
            ["date", "mean_2m_air_temperature", "total_precipitation"]
        ]
        ts_df = ts_df[ts_df["date"] >= earliest_veg_date]
        ts_list.append(ts_df)
        ts_list.append(ts_df_historic)
    else:
        ts_list.append(ts_df)

    return ts_list

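# Sketch of the collapse performed above (assumed toy data, hedged): each date
# has one row per sub-location, and make_time_series reduces these to
# "<prefix><column>_mean" / "<prefix><column>_std" columns per date.
#
#     >>> df = pd.DataFrame({"date": ["2020-01-01"] * 3,
#     ...                    "offset50": [400.0, 420.0, 410.0]})
#     >>> df.groupby("date").mean()    # one row per date, offset50 = 410.0
#     >>> df.groupby("date").std()     # one row per date, offset50 = 10.0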
def resample_time_series(series, period="MS"):
    """
    Resample and interpolate a time series so we have one row per time
    period (useful for FFT).

    Parameters
    ----------
    series: Series
        Time series values, indexed by date.
    period: string
        Period for resampling.

    Returns
    -------
    Series:
        pandas Series with datetime index, one row per time period.
    """
    # give the series a date index if the DataFrame is not indexed by date already
    # if df.index.name != 'date':
    #     series.index = df.date

    # just in case the index isn't already datetime type
    series.index = pd.to_datetime(series.index)

    # resample to get one row per time period
    rseries = series.resample(period).mean()
    new_series = rseries.interpolate()

    return new_series

def resample_dataframe(df, columns, period="MS"):
    """
    Resample and interpolate a time series dataframe so we have one row
    per time period.

    Parameters
    ----------
    df: DataFrame
        Dataframe with a 'date' column.
    columns: list
        List of column names to resample. Should contain numeric data.
    period: string
        Period for resampling.

    Returns
    -------
    DataFrame:
        DataFrame with resampled time series in `columns`.
    """
    # new empty df to deal with length mismatches after resampling
    df_out = pd.DataFrame()

    # for each column to resample
    for column in columns:
        # resample the column
        series = df.set_index("date")[column]
        df_out[column] = resample_time_series(series, period=period)

    # generate a clean index
    df_out = df_out.reset_index()

    return df_out

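# Toy example of the monthly ("MS" = month start) resampling used here
# (assumed data, hedged): gaps introduced by resampling are linearly
# interpolated by resample_time_series.
#
#     >>> df = pd.DataFrame({"date": ["2020-01-01", "2020-03-01"],
#     ...                    "offset50": [100.0, 120.0]})
#     >>> resample_dataframe(df, ["offset50"], period="MS")["offset50"].tolist()
#     [100.0, 110.0, 120.0]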
def resample_data(dfs, period="MS"):
    """
    Resample vegetation and rainfall DataFrames. Vegetation DataFrames are
    resampled at the sub-image level.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    period: string
        Period for resampling.

    Returns
    ----------
    dict of DataFrame
        Resampled data.
    """
    # loop over collections
    for col_name, df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # specify veg columns to resample
            columns = [c for c in df.columns if "offset50" in c]

            # group by (lat, long)
            d = {}
            for name, group in df.groupby(["latitude", "longitude"]):
                d[name] = group

            # for each sub-image
            for key, df_ in d.items():
                # resample
                df_ = resample_dataframe(df_, columns, period=period)
                # replace df
                d[key] = df_

            # reconstruct the DataFrame
            df = list(d.values())[0]
            for df_ in list(d.values())[1:]:
                df = df.append(df_)

            # replace collection
            dfs[col_name] = df

        else:
            # assume ERA5 data
            columns = ["total_precipitation", "mean_2m_air_temperature"]

            # resample
            df = resample_dataframe(df, columns, period=period)

            # replace collection
            dfs[col_name] = df

    return dfs

def drop_veg_outliers(dfs, column="offset50", sigmas=3.0):
    """
    Loop over vegetation DataFrames and drop points in the time series
    that are significantly far away from the mean of the time series. Such
    points are assumed to be unphysical.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    column : str
        Name of the column to drop outliers on.
    sigmas : float
        Number of standard deviations a data point has to be from the mean
        to be labelled as an outlier and dropped.

    Returns
    ----------
    dict of DataFrame
        Time series data for multiple sub-image locations with some values
        in `column` potentially set to NaN.
    """
    # loop over collections
    for col_name, veg_df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # group by (lat, long)
            d = {}
            for name, group in veg_df.groupby(["latitude", "longitude"]):
                d[name] = group

            # for each sub-image
            for key, df_ in d.items():
                # calculate residuals to the mean
                res = (df_[column] - df_[column].mean()).abs()

                # determine which are outliers
                outlier = res > df_[column].std() * sigmas

                # set to None
                df_.loc[outlier, column] = None

                # replace the df
                d[key] = df_

            # reconstruct the DataFrame
            df = list(d.values())[0]
            for df_ in list(d.values())[1:]:
                df = df.append(df_)

            # replace value in df
            dfs[col_name] = df

    return dfs

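# The outlier rule above is a simple sigma clip; a hedged, standalone sketch
# of the same masking logic on an arbitrary series `s`:
#
#     >>> res = (s - s.mean()).abs()
#     >>> outliers = res > s.std() * 3.0    # boolean mask of points to null out
#     >>> s[outliers] = None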
def smooth_veg_data(dfs, column="offset50", n=4):
    """
    Loop over vegetation DataFrames and perform LOESS smoothing on the time
    series of each sub-image.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    column : str
        Name of the column to drop outliers from and smooth.
    n : int
        Number of neighbouring points to use in smoothing.

    Returns
    ----------
    dict of DataFrame
        Time series data for multiple sub-image locations with new columns
        for smoothed data and ci.
    """
    # create a new dataframe to avoid overwriting input
    dfs = dfs.copy()

    # loop over collections
    for col_name, df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # remove outliers and smooth
            df = smooth_all_sub_images(df, column=column, n=n)

            # calculate ci
            # df = get_confidence_intervals(df, column=column)

            # replace DataFrame
            dfs[col_name] = df

    return dfs

def smooth_subimage(df, column="offset50", n=4, it=3):
    """
    Perform LOWESS (Locally Weighted Scatterplot Smoothing) on the time
    series of a single sub-image.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame containing the time series for a single sub-image.
    column : string, optional
        Name of the column in df to smooth.
    n : int, optional
        Size of smoothing window.
    it : int, optional
        Number of iterations of LOESS smoothing to perform.

    Returns
    ----------
    DataFrame
        The time-series DataFrame with a new column containing the
        smoothed results.
    """
    df.dropna(inplace=True)

    # add a new column of datetime objects
    df["datetime"] = pd.to_datetime(df["date"], format="%Y/%m/%d")

    # extract data
    xs = df["datetime"]
    ys = df[column]

    # num_days_per_timepoint = (xs.iloc[1] - xs.iloc[0]).days
    frac_data = min(n / len(ys), 1.0)

    # perform smoothing
    smoothed_y = lowess(
        ys, xs, is_sorted=True, return_sorted=False, frac=frac_data, it=it
    )

    # add to df
    df[column + "_smooth"] = smoothed_y
    df[column + "_smooth_res"] = ys - smoothed_y

    return df

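# A hedged, standalone sketch of the LOWESS call used above: `frac` is the
# fraction of the data used in each local window, i.e. n points out of len(ys).
#
#     >>> import numpy as np
#     >>> from statsmodels.nonparametric.smoothers_lowess import lowess
#     >>> xs = np.arange(20)
#     >>> ys = np.sin(xs / 3.0) + np.random.normal(0, 0.1, 20)
#     >>> smoothed = lowess(ys, xs, frac=min(4 / len(ys), 1.0), it=3,
#     ...                   is_sorted=True, return_sorted=False)
#     >>> smoothed.shape
#     (20,)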
def smooth_all_sub_images(df, column="offset50", n=4, it=3):
    """
    Perform LOWESS (Locally Weighted Scatterplot Smoothing) on the time
    series of a set of sub-images.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing time series results for all sub-images,
        with multiple rows per time point and (lat,long) point.
    column : string, optional
        Name of the column in df to smooth.
    n : int, optional
        Size of smoothing window.
    it : int, optional
        Number of iterations of LOESS smoothing to perform.

    Returns
    ----------
    Dataframe
        DataFrame of results with a new column containing a LOESS smoothed
        version of the column `column`.
    """
    # group by (lat, long)
    d = {}
    for name, group in df.groupby(["latitude", "longitude"]):
        d[name] = group

    # for each sub-image
    for key, df_ in d.items():
        # perform smoothing
        d[key] = smooth_subimage(df_, column=column, n=n, it=it)

    # reconstruct the DataFrame
    df = list(d.values())[0]
    for df_ in list(d.values())[1:]:
        df = df.append(df_)

    return df

def store_feature_vectors(dfs, output_dir):
    """
    Write out all feature vector information to a csv file, to be read
    later by the feature vector plotting script.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    output_dir : str
        Path to directory to save the csv.
    """
    # loop over collections
    for col_name, veg_df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # check the feature vectors are available
            if "feature_vec" not in veg_df.columns:
                print("Could not find feature vectors.")
                continue

            # sort by date
            veg_df = veg_df.sort_values(by="date").dropna()

            # create a df to store feature vectors
            df = pd.DataFrame()

            [
                print(value)
                for value in veg_df.feature_vec
                if not isinstance(value, list)
            ]

            # add feature vectors to dataframe
            df = pd.DataFrame(value for value in veg_df.feature_vec)

            # rename percentile columns
            df = df.rename(columns={n: f"{(n+1)*5}th_percentile" for n in df.columns})

            # reindex
            df.index = veg_df.index

            # add information
            df.insert(0, "date", veg_df["date"])
            df.insert(1, "latitude", veg_df["latitude"])
            df.insert(2, "longitude", veg_df["longitude"])

            # save csv
            if col_name == "COPERNICUS/S2":
                s = "S2"
            elif "LANDSAT" in col_name:
                s = "L" + col_name.split("/")[1][-1] + "_"
            else:
                s = col_name
            filename = os.path.join(output_dir, s + "_feature_vectors.csv")
            df.to_csv(filename, index=False)

def fill_veg_gaps(dfs, missing):
    """
    Loop through sub-image time series and replace any gaps with the mean
    value of the same month in other years.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    missing : dict of array
        Missing time points where no sub-images were analysed, for each
        veg dataframe in `dfs`.
    """
    # loop over collections
    for col_name, veg_df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # group by (lat, long)
            d = {}
            for name, group in veg_df.groupby(["latitude", "longitude"]):
                d[name] = group

            # for each sub-image
            for key, df_ in d.items():

                # get lat, long of this sub-image
                lats = df_.latitude.drop_duplicates().values
                longs = df_.longitude.drop_duplicates().values
                assert len(lats) == 1
                assert len(longs) == 1
                lat = lats[0]
                long = longs[0]

                # construct missing rows
                missing_rows = [pd.Series({"date": date}) for date in missing[col_name]]
                if len(missing_rows) == 0:
                    continue

                # add back in missing values if necessary
                df_ = df_.append(missing_rows, ignore_index=True).sort_values(by="date")

                # make a new 'month' column
                df_["month"] = df_.date.str.split("-").str[1]

                # group by month and get monthly means
                monthly_means = df_.groupby("month").mean().offset50

                # loop through dataframe
                for index, row in df_.iterrows():

                    # fill missing months with mean value
                    if pd.isnull(row.offset50):
                        this_month = row.month
                        df_.loc[index, "offset50"] = monthly_means.loc[this_month]
                        df_.loc[index, "latitude"] = lat
                        df_.loc[index, "longitude"] = long
                        df_.loc[index, "feature_vec"] = np.NaN

                # drop month column and replace old df
                df_ = df_.drop(columns="month")
                d[key] = df_

            # reconstruct the DataFrame
            df = list(d.values())[0]
            for df_ in list(d.values())[1:]:
                df = df.append(df_)

            dfs[col_name] = df

    return dfs

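# Hedged toy illustration of the gap-filling rule above: a missing value is
# replaced by the mean of the same calendar month in other years
# (numeric_only=True is added here only to keep the toy robust on newer pandas).
#
#     >>> df = pd.DataFrame({"date": ["2019-06-01", "2020-06-01", "2021-06-01"],
#     ...                    "offset50": [100.0, None, 120.0]})
#     >>> df["month"] = df.date.str.split("-").str[1]
#     >>> df.groupby("month").mean(numeric_only=True).offset50   # -> 110.0 for month "06"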
def get_missing_time_points(dfs):
    """
    Find missing time points for each vegetation dataframe in `dfs`, and
    return a dict, with the same key as in `dfs`, but with values
    corresponding to missing dates.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.

    Returns
    ----------
    dict
        Missing time points for each vegetation df.
    """
    # determine missing vegetation time points
    missing_points = {}

    # loop over collections
    for col_name, veg_df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # get the start of the vegetation time series
            veg_start_date = veg_df.dropna().index[0]

            # remove leading NaNs
            veg_df = veg_df.loc[veg_start_date:]

            # store missing time points
            missing_points[col_name] = veg_df.drop_duplicates(
                subset="date", keep=False
            ).date.values

    return missing_points

def detrend_df(df, period="MS"):
    """
    Remove seasonality from a DataFrame containing the time series for a
    single sub-image.

    Parameters
    ----------
    df : DataFrame
        Time series data for a single sub-image location.
    period : str, optional
        Resample time series to this frequency and then infer the lag to
        use for deseasonalizing.

    Returns
    ----------
    DataFrame
        Input with seasonality removed from time series columns.
    """
    # infer lag from period; we need at least 2 years of data for the differencing to work
    if period == "MS":
        lag = 12
    else:
        raise ValueError('Periods other than "MS" are not well supported yet!')

    # new empty df to deal with length mismatches after resampling
    df_out = pd.DataFrame()

    # resample time series (in case not done already)
    columns = [
        c
        for c in df.columns
        if any([s in c for s in ["offset50", "precipitation", "temperature", "ndvi"]])
    ]
    df_out = resample_dataframe(df, columns, period=period)

    # detrend veg and climate columns
    for col in columns:
        df_out[col] = df_out[col].diff(lag)

    # need to keep this info for smoothing later
    try:
        df_out["latitude"] = df["latitude"].iloc[0]
        df_out["longitude"] = df["longitude"].iloc[0]
    except (KeyError, IndexError):
        pass

    return df_out

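# The deseasonalizing above is a 12-step difference on monthly data; a hedged
# toy example of pandas' diff with lag 12 on a linear trend:
#
#     >>> s = pd.Series(range(24))
#     >>> s.diff(12).dropna().unique()    # -> array([12.]), seasonality-free constant step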
def detrend_data(dfs, period="MS"):
    """
    Loop over each sub-image time series DataFrame and remove seasonality
    from the time series by subtracting the previous year. Remove
    seasonality from precipitation data in the same way.

    Parameters
    ----------
    dfs : dict of DataFrame
        Time series data for multiple sub-image locations.
    period : str, optional
        Resample time series to this frequency and then infer the lag to
        use for deseasonalizing.

    Returns
    ----------
    dict of DataFrame
        Time series data for multiple sub-images with seasonality removed.
    """
    # don't overwrite input
    dfs = dfs.copy()

    for col_name, df in dfs.items():

        # if vegetation data
        if "COPERNICUS/S2" in col_name or "LANDSAT" in col_name:

            # group by (lat, long)
            d = {}
            for name, group in df.groupby(["latitude", "longitude"], as_index=False):
                d[name] = group

            # for each sub-image
            for key, df_ in d.items():
                d[key] = detrend_df(df_, period)

            # reconstruct the DataFrame
            df = list(d.values())[0]
            for df_ in list(d.values())[1:]:
                df = df.append(df_)
            df.dropna(inplace=True)

            dfs[col_name] = df

        else:
            # remove seasonality for weather data, this is a simpler time series
            dfs[col_name] = detrend_df(dfs[col_name], period)
            dfs[col_name].dropna(inplace=True)

    return dfs

def preprocess_data(
    input_json,
    output_basedir,
    drop_outliers=True,
    fill_missing=True,
    resample=True,
    smoothing=True,
    detrend=True,
    n_smooth=4,
    period="MS",
):
    """
    This function reads and processes data downloaded by GEE. Processing
    can be configured by the function arguments. Processed data is written
    to csv.

    Parameters
    ----------
    input_json : dict
        JSON data created during a GEE download job.
    output_basedir : str
        Directory where the time-series csv will be put.
    drop_outliers : bool, optional
        Remove outliers in sub-image time series.
    fill_missing : bool, optional
        Fill missing points in the time series.
    resample : bool, optional
        Resample the time series using linear interpolation.
    smoothing : bool, optional
        Smooth the time series using LOESS smoothing.
    detrend : bool, optional
        Remove seasonal component by subtracting the previous year.
    n_smooth : int, optional
        Number of time points to use for the smoothing window size.
    period : str, optional
        Pandas DateOffset string describing the sampling frequency.

    Returns
    ----------
    output_dir: str
        Path to the csv file containing processed data.
    dfs: dict
        Dictionary of dataframes.
    """

    # put output plots in the results dir
    output_dir = os.path.join(output_basedir, "processed_data")

    # make output subdir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # read dict from json file to dataframes
    dfs = read_json_to_dataframes(input_json)

    # keep track of time points where data is missing (by default pandas
    # groupby operations, which are used heavily in this module, drop NaNs)
    missing = get_missing_time_points(dfs)
    missing_json = {k: list(v) for k, v in missing.items()}
    write_to_json(os.path.join(output_dir, "missing_dates.json"), missing_json)

    print("\nPreprocessing data...")
    print("-" * 21)

    # remove outliers from the time series
    if drop_outliers:
        print("- Dropping vegetation outliers...")
        dfs = drop_veg_outliers(dfs, sigmas=3)

    # use the same month in different years to fill gaps
    if fill_missing:
        print("- Fill gaps in sub-image time series...")
        dfs = fill_veg_gaps(dfs, missing)

    # LOESS smoothing on sub-image time series
    if smoothing:
        print("- Smoothing vegetation time series...")
        dfs = smooth_veg_data(dfs, n=n_smooth)

    # store feature vectors before averaging over sub-images
    print("- Saving feature vectors...")
    store_feature_vectors(dfs, output_dir)

    # average over sub-images
    ts_list = make_time_series(dfs)
    ts_df = ts_list[0]
    if len(ts_list) > 1:
        ts_historic = ts_list[1]
    else:
        ts_historic = pd.DataFrame()

    # resample the averaged time series using linear interpolation
    if resample:
        print("- Resampling time series...")
        columns = [
            c
            for c in ts_df.columns
            if any([s in c for s in ["offset50", "precipitation", "temperature"]])
        ]
        ts_df = resample_dataframe(ts_df, columns, period=period)

    # save as csv
    ts_filename = os.path.join(output_dir, "time_series.csv")
    print(f'- Saving time series to "{ts_filename}".')
    ts_df.to_csv(ts_filename, index=False)

    if not ts_historic.empty:
        ts_filename = os.path.join(output_dir, "time_series_historic.csv")
        print(f'- Saving time series to "{ts_filename}".')
        ts_historic.to_csv(ts_filename, index=False)

    # additionally save resampled & detrended time series
    # this detrending option (one-year seasonality subtraction) only works on
    # monthly data that covers at least 2 years
    if detrend and ts_df.shape[0] > 24 and period == 'MS':
        print("- Detrending time series...")

        # remove seasonality from sub-image time series
        dfs_detrended = detrend_data(dfs, period=period)
        print("- Smoothing vegetation time series after removing seasonality...")
        dfs_detrended_smooth = smooth_veg_data(dfs_detrended, n=12)

        # combine over sub-images
        ts_df_detrended_smooth = make_time_series(dfs_detrended_smooth)[0]

        # save output
        ts_filename_detrended = os.path.join(output_dir, "time_series_detrended.csv")
        print(f'- Saving detrended time series to "{ts_filename_detrended}".')
        ts_df_detrended_smooth.to_csv(ts_filename_detrended, index=False)

    return output_dir, dfs  # for now return `dfs` for spatial plot compatibility

def save_ts_summary_stats(ts_dirname, output_dir, metadata):
    """
    Given the time series DataFrames (constructed with `make_time_series`),
    compute summary statistics of all the available time series.

    Parameters
    ----------
    ts_dirname : str
        Directory where the time series are saved.
    output_dir : str
        Directory to save the plots in.
    metadata: dict
        Dictionary with metadata from the location.
    """
    # read processed data
    # get filenames of preprocessed data time series
    ts_filenames = [f for f in os.listdir(ts_dirname) if "time_series" in f]

    # we should get one seasonal time series and a detrended one
    ts_df_detrended = pd.DataFrame()
    ts_df_historic = pd.DataFrame()
    for filename in ts_filenames:
        if "detrended" in filename:
            ts_df_detrended = pd.read_csv(os.path.join(ts_dirname, filename))
        elif "historic" in filename:
            ts_df_historic = pd.read_csv(os.path.join(ts_dirname, filename))
        else:
            ts_df = pd.read_csv(os.path.join(ts_dirname, filename))

    def get_ts_summary_stats(series):
        '''
        Get the summary stats of the time series and return them as a dictionary.
        '''
        stats_dict = {}
        stats_dict['min'] = series.min()
        stats_dict['max'] = series.max()
        stats_dict['mean'] = series.mean()
        stats_dict['median'] = series.median()
        stats_dict['std'] = series.std()
        return stats_dict

    # calculate summary statistics for each relevant time series
    ts_dict_list = []

    # only look at relevant time series (offset50, ndvi and precipitation)
    if not ts_df_historic.empty:
        column_dict = get_ts_summary_stats(ts_df_historic["total_precipitation"])
        column_dict["ts_id"] = "total_precipitation_historic"
        for key in metadata:
            column_dict[key] = metadata[key]
        ts_dict_list.append(column_dict)

    column_names = [
        c
        for c in ts_df.columns
        if 'offset50_mean' in c or 'ndvi_mean' in c or 'total_precipitation' in c
    ]

    for column in column_names:
        print(f'Calculating summary stats for "{column}"...')
        column_dict = get_ts_summary_stats(ts_df[column])
        column_dict['ts_id'] = column

        # make sure it is a dated time series for resampling later in the CB calculation
        ts_df[column].index = pd.DatetimeIndex(ts_df['date'])
        cb_params, success, residuals = cball_parfit(
            [1.5, 150.0, 8.0, 2.0], ts_df[column], column, output_dir
        )
        column_dict['CB_fit_success'] = success
        column_dict['CB_fit_residuals'] = residuals
        column_dict['CB_fit_alpha'] = cb_params[0]
        column_dict['CB_fit_N'] = cb_params[1]
        column_dict['CB_fit_xbar'] = cb_params[2]
        column_dict['CB_fit_sigma'] = cb_params[3]

        for key in metadata:
            column_dict[key] = metadata[key]

        # We want the AR1 and standard deviation of the detrended time series
        # for the summary stats
        if ts_df_detrended.empty == False:
            ews_dic_veg = ewstools.core.ews_compute(
                ts_df_detrended[column].dropna(),
                roll_window=0.999,
                smooth='Gaussian',
                lag_times=[1],
                ews=["var", "ac"],
                band_width=6,
            )
            EWSmetrics_df = ews_dic_veg['EWS metrics']
            column_dict["Lag-1 AC (0.99 rolling window)"] = EWSmetrics_df["Lag-1 AC"].iloc[-1]
            column_dict["Variance (0.99 rolling window)"] = EWSmetrics_df["Variance"].iloc[-1]

            ews_dic_veg_50 = ewstools.core.ews_compute(
                ts_df_detrended[column].dropna(),
                roll_window=0.5,
                smooth='Gaussian',
                lag_times=[1],
                ews=["var", "ac"],
                band_width=6,
            )
            Kendall_tau_50 = ews_dic_veg_50['Kendall tau']
            column_dict["Kendall tau Lag-1 AC (0.5 rolling window)"] = Kendall_tau_50["Lag-1 AC"].iloc[-1]
            column_dict["Kendall tau Variance (0.5 rolling window)"] = Kendall_tau_50["Variance"].iloc[-1]

        # We also want the AR1 and standard deviation of the raw seasonal time
        # series for the summary stats
        if ts_df.empty == False:
            # make sure in this case that the index is numeric and not datetime
            ts = ts_df[column].dropna()
            ts.index = pd.to_numeric(ts.index)
            ews_dic_veg_seasonal = ewstools.core.ews_compute(
                ts,
                roll_window=0.999,
                smooth='None',
                lag_times=[1],
                ews=["var", "ac"],
            )
            EWSmetrics_df_seasonal = ews_dic_veg_seasonal['EWS metrics']
            column_dict["Lag-1 AC (0.99 rolling window) Seasonal"] = EWSmetrics_df_seasonal["Lag-1 AC"].iloc[-1]
            column_dict["Variance (0.99 rolling window) Seasonal"] = EWSmetrics_df_seasonal["Variance"].iloc[-1]

            ews_dic_veg_50_seasonal = ewstools.core.ews_compute(
                ts,
                roll_window=0.5,
                smooth='None',
                lag_times=[1],
                ews=["var", "ac"],
            )
            Kendall_tau_50_seasonal = ews_dic_veg_50_seasonal['Kendall tau']
            column_dict["Kendall tau Lag-1 AC (0.5 rolling window) Seasonal"] = Kendall_tau_50_seasonal["Lag-1 AC"].iloc[-1]
            column_dict["Kendall tau Variance (0.5 rolling window) Seasonal"] = Kendall_tau_50_seasonal["Variance"].iloc[-1]

        ts_dict_list.append(column_dict)

    ss_name = construct_filename_from_metadata(metadata, "summary_stats.csv")

    # turn the list of dictionaries into a dataframe and save it
    ts_df_summary = pd.DataFrame(ts_dict_list)

    # save both name-specific and generic copies (might be useful inside the analysis later)
    ts_df_summary.to_csv(os.path.join(output_dir, "time_series_summary_stats.csv"))
    ts_df_summary.to_csv(os.path.join(output_dir, ss_name))

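# Hedged usage sketch (paths and metadata values are illustrative): after
# preprocess_data has written the csv files, summary statistics can be
# computed from that directory; `metadata` is whatever location metadata
# dict the caller has available.
#
#     >>> metadata = {"latitude": 11.58, "longitude": 27.99}
#     >>> save_ts_summary_stats("./my_location/processed_data",
#     ...                       "./my_location/analysis", metadata)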