Source code for running.merger

import glob
import json
import logging
import os
import re
import shutil
from bisect import bisect_left, bisect_right
from math import sqrt, ceil
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from crawlers.cbasic import Crawler
from crawlers.declarable import declaration_to_filename
from running.metrics import Metric
from utils import RESULT_DIR


def compute_aucc(xs, ys):
    """ Area Under the Crawling Curve: trapezoidal integral of ys over xs, with xs scaled so the last point is 1. """
    # from sklearn.metrics import auc
    # return auc(xs, ys)
    assert len(xs) == len(ys) > 0
    xs = xs / xs[-1]
    res = xs[0] * ys[0] / 2
    for i in range(1, len(xs)):
        res += (xs[i] - xs[i-1]) * (ys[i-1] + ys[i]) / 2
    return res


def compute_waucc(xs, ys):
    """ Weighted AUCC: each trapezoid is weighted by 1/x, which emphasizes the early part of the curve. """
    # res = compute_aucc(np.log(xs), ys)
    assert len(xs) == len(ys) > 0
    xs = xs / xs[-1]
    res = 0 if xs[0] == 0 else ys[0]
    norm = 0 if xs[0] == 0 else 1
    for i in range(1, len(xs)):
        res += (xs[i] - xs[i-1]) * (ys[i-1] + ys[i]) / 2 / xs[i]
        norm += (xs[i] - xs[i-1]) / xs[i]
    return res / norm


def compute_targets_crawled(xs, ys):
    """ Final metric value at the end of the crawl, e.g. the number/fraction of targets crawled. """
    assert len(xs) == len(ys) > 0
    return ys[-1]
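
# A minimal illustration of the three aggregators above (hypothetical toy values, not taken
# from real crawl results). AUCC is the trapezoidal area under the curve after scaling x to
# [0, 1]; wAUCC weights every trapezoid by 1/x, stressing the early stage of crawling; the
# targets-crawled aggregator simply reports the final y value:
#
#   xs = np.array([1, 2, 4]); ys = np.array([0.1, 0.3, 0.6])
#   compute_aucc(xs, ys)             # ~= 0.2875
#   compute_waucc(xs, ys)            # ~= 0.2125
#   compute_targets_crawled(xs, ys)  # 0.6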


def average(array, median=False, **kwargs):
    if len(array) == 0:
        return np.nan
    return (np.median if median else np.mean)(array, **kwargs)


def variance(array, **kwargs):
    if len(array) == 0:
        return np.nan
    return np.var(array, **kwargs)


LINESTYLES = ['-', '--', ':', '-.']
COLORS = ['black', 'b', 'g', 'r', 'c', 'm', 'y',
          'darkblue', 'darkgreen', 'darkred', 'darkmagenta', 'darkorange', 'darkcyan',
          'pink', 'lime', 'wheat', 'lightsteelblue']


class ResultsMerger:
    """
    ResultsMerger can aggregate and plot results saved in files.
    It processes all combinations of G graphs x C crawlers x M metrics, averaging over n instances of each.
    All missing instances are just ignored.

    Plotting functions:

    * draw_by_crawler - Draw M x G table of plots with C lines each. Ox - crawling step, Oy - metric value.
    * draw_by_metric_crawler - Draw G plots with C x M lines each. Ox - crawling step, Oy - metric value.
    * draw_by_metric - Draw C x G table of plots with M lines each. Ox - crawling step, Oy - metric value.
    * draw_aggregated - Draw G plots with M lines. Ox - C crawlers, Oy - (w)AUCC value (M curves with error bars).
    * draw_winners - Draw C stacked bars (each of M elements). Ox - C crawlers, Oy - number of wins (among G) by (w)AUCC value.

    Additional functions:

    * missing_instances - Calculate how many instances of all configurations are missing.
    * move_folders - Move/remove/copy saved instances for current graphs, crawlers, metrics.

    NOTES:

    * x values must be the same for all files and are the ones generated by
      `exponential_batch_generator()` from running/runner.py
    * it is assumed that for all instances the value lists are of equal lengths (i.e. budgets).
      Otherwise normalisation and aggregation may fail. If so, use the `x_lims` parameter to control this.
    """

    def __init__(self, graph_full_names, crawler_decls, metric_decls, budget,
                 n_instances=None, x_lims=None, result_dir=RESULT_DIR, numeric_only=True):
        """
        :param graph_full_names: list of graphs full names.
        :param crawler_decls: list of crawlers declarations.
        :param metric_decls: list of metrics declarations. Non-numeric metrics will be ignored.
        :param budget: results with this budget will be taken.
        :param n_instances: number of instances to average over, None for all.
        :param x_lims: use only the specified x-limits for all plots unless another value is given
            in a plotting function.
        :param result_dir: specify to use a non-default directory where results are stored.
        :param numeric_only: if True (default), ignore non-numeric metrics.
        """
        self.graph_full_names = graph_full_names
        self.crawler_names = []  # list(map(declaration_to_filename, crawler_decls))
        self.metric_names = []  # list(map(declaration_to_filename, metric_decls))
        self.labels = {}  # pretty short names to draw in plots

        # Generate pretty names for crawlers and metrics for plots
        for md in metric_decls:
            m = Metric.from_declaration(md, graph=None)
            if numeric_only and not m.is_numeric:  # Ignore non-numeric metrics
                continue
            f = declaration_to_filename(m.declaration)
            self.metric_names.append(f)
            self.labels[f] = m.name
        for cd in crawler_decls:
            c = Crawler.from_declaration(cd, graph=None)
            f = declaration_to_filename(c.declaration)
            self.crawler_names.append(f)
            self.labels[f] = c.name

        self.budget = budget
        self.n_instances = n_instances
        self.x_lims = x_lims
        self.instances = {}  # instances[graph][crawler][metric] -> count of instances
        # contents[graph][crawler][metric]:
        #   'x' -> [nums of steps],
        #   'ys' -> [[y for each step] for each instance],
        #   'avy' -> [avg y for each step]
        self.contents = {}
        # auccs[graph][crawler][metric]:
        #   'AUCC' -> [AUCC for each instance],
        #   'wAUCC' -> [wAUCC for each instance]
        self.auccs = {}
        self.result_dir = result_dir

        self._read()
        plt.style.use('seaborn')

    @staticmethod
    def names_to_path(graph_full_name: tuple, crawler_name: str, metric_name: str, budget: int,
                      result_dir=RESULT_DIR):
        """
        Returns file pattern, e.g.
        '/home/misha/workspace/crawling/results/ego-gplus/POD(batch=1)/TopK(centrality=BtwDistr,measure=Re,part=crawled,top=0.01)/\*.json'
        """
        # TODO apply
        # path = Path(
        #     result_dir, *graph_full_name, crawler_name, metric_name, f"budget={budget}", "*.json")
        path = Path(result_dir, *graph_full_name, crawler_name, metric_name, "*.json")
        return path

    def _read(self):
        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Reading history')
        self.instances.clear()
        # self.contents.clear()
        for g in self.graph_full_names:
            self.instances[g] = {}
            self.contents[g] = {}
            for c in self.crawler_names:
                self.instances[g][c] = {}
                self.contents[g][c] = {}
                for m in self.metric_names:
                    # TODO apply
                    # path = ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir)
                    # fn_pattern = re.compile(r'(\d+)\.json')
                    # paths = []
                    # for file in path.parent.iterdir():
                    #     match = re.findall(fn_pattern, file.name)
                    #     if match:
                    #         print(file.name, match[0][0])
                    #         paths.append(file)
                    # paths = sorted(paths)[:self.n_instances]
                    path_pattern = ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir)
                    # FIXME workaround for glob since '[' is a special symbol for it
                    path_pattern = str(path_pattern).replace('[', '[[]')
                    paths = sorted(glob.glob(path_pattern))[:self.n_instances]
                    self.instances[g][c][m] = len(paths)
                    self.contents[g][c][m] = contents = {}
                    count = len(paths)
                    contents['x'] = []
                    contents['ys'] = ys = [[] for _ in range(count)]
                    contents['avy'] = []
                    i0 = 0
                    i1 = None
                    for inst, p in enumerate(paths):
                        with open(p, 'r') as f:
                            imported = json.load(f)
                        if len(contents['x']) == 0:
                            xs = np.array(sorted([int(x) for x in imported.keys()]))
                            if self.x_lims:  # Cut over x_lims
                                x0, x1 = self.x_lims
                                i0 = bisect_left(xs, x0)
                                i1 = bisect_right(xs, x1) + 1
                            xs = xs[i0: i1]
                            contents['x'] = xs
                        if inst == 0:
                            contents['avy'] = np.zeros(len(xs))
                        try:
                            # Convert to float and compute the average if possible
                            ys[inst] = np.array([float(x) for x in imported.values()])[i0: i1]
                            contents['avy'] += np.array(ys[inst]) / count
                        except TypeError:
                            # Non-numeric values - keep as is
                            ys[inst] = np.array(list(imported.values()))[i0: i1]
                    pbar.update(1)
        pbar.close()

    def move_folders(self, path_from=None, path_to=None, copy=False):
        """
        Move/remove/copy all saved instances for current [graphs X crawlers X metrics].
        Specify the `path_to` parameter to move files instead of removing them.

        :param path_from: this folder is the root for all folders to be (re)moved;
            it must be contained in the path to those folders
        :param path_to: this folder is the destination for all folders to be moved.
            If None (default), all folders will be removed.
        :param copy: set to True to copy folders instead of moving them
        """
        if path_from is None:
            path_from = self.result_dir
        path_from = str(path_from)
        if path_to is not None:
            path_to = str(path_to)
        move_or_copy = shutil.copytree if copy else shutil.move
        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='(Re)moving history')
        folder = None
        removed = 0
        removed_empty = 0
        moved = 0
        from os.path import dirname as parent
        from os.path import exists as exist
        for g in self.graph_full_names:
            for c in self.crawler_names:
                for m in self.metric_names:
                    folder = str(ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir).parent)
                    if not exist(folder):
                        continue
                    if path_to is None:  # remove
                        shutil.rmtree(folder, ignore_errors=True)
                        removed += 1
                    else:  # move or copy
                        assert path_from in folder
                        dst = folder.replace(path_from, path_to)
                        move_or_copy(folder, dst)
                        moved += 1
                    pbar.update(1)
                # Remove parent folder if it exists and is empty
                if exist(parent(folder)) and not os.listdir(parent(folder)):
                    os.rmdir(parent(folder))
                    removed_empty += 1
            # Remove grandparent folder if it exists and is empty
            if exist(parent(parent(folder))) and not os.listdir(parent(parent(folder))):
                os.rmdir(parent(parent(folder)))
                removed_empty += 1
        pbar.close()
        print("Moved %s folders, removed %s folders including %s empty ones" % (
            moved, removed, removed_empty))
        self.instances.clear()
        self.contents.clear()

    def missing_instances(self) -> dict:
        """
        Return a dict of configurations where the number of computed instances is less than n_instances.

        :return: result[graph][crawler][metric] -> missing count
        """
        missing = {}
        for g in self.graph_full_names:
            missing[g] = {}
            for c in self.crawler_names:
                missing[g][c] = {}
                for m in self.metric_names:
                    present = self.instances[g][c][m]
                    if self.n_instances is not None and self.n_instances > present:
                        missing[g][c][m] = self.n_instances - present
                if len(missing[g][c]) == 0:
                    del missing[g][c]
            if len(missing[g]) == 0:
                del missing[g]
        # print(json.dumps(missing, indent=2))
        return missing

    def draw_by_crawler(self, x_lims=None, x_normalize=True, sharey=True, draw_error=True,
                        draw_each_instance=False, scale=3, title="By crawler"):
        """
        Draw an M x G table of plots with C lines each, where M - number of metrics,
        G - number of graphs, C - number of crawlers. Ox - crawling step, Oy - metric value.

        :param x_lims: x-limits for plots. Overrides x_lims passed in constructor
        :param x_normalize: if True, x values are normalized to lie in [0, 1]
        :param sharey: if True, share y axes between subplots
        :param draw_error: if True, fill a standard deviation area around the averaged crawling curve
        :param draw_each_instance: if True, show each instance
        :param scale: size of plots (default 3)
        :param title: figure title
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        M = len(self.metric_names)
        nrows, ncols = M, G
        if M == 1:
            nrows = int(sqrt(G))
            ncols = ceil(G / nrows)
        if G == 1:
            nrows = int(sqrt(M))
            ncols = ceil(M / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by crawler')
        aix = 0
        for i, m in enumerate(self.metric_names):
            for j, g in enumerate(self.graph_full_names):
                if nrows > 1 and ncols > 1:
                    plt.sca(axs[aix // ncols, aix % ncols])
                elif nrows * ncols > 1:
                    plt.sca(axs[aix])
                if aix % G == 0:
                    plt.ylabel(self.labels[m])
                if i == 0:
                    plt.title(g[-1])
                if aix // ncols == nrows - 1:
                    plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
                aix += 1
                if x_lims:
                    plt.xlim(x_lims)
                for k, c in enumerate(self.crawler_names):
                    contents = self.contents[g][c][m]
                    # Draw each instance
                    if draw_each_instance:
                        for inst in range(len(contents['ys'])):
                            plt.plot(contents['x'], contents['ys'][inst],
                                     color=COLORS[k % len(COLORS)], linewidth=1, linestyle=':')
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         color=COLORS[k % len(COLORS)], alpha=0.2)
                    plt.plot(xs, contents['avy'], color=COLORS[k % len(COLORS)], linewidth=1,
                             label="[%s] %s" % (self.instances[g][c][m], self.labels[c]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def draw_by_metric(self, x_lims=None, x_normalize=True, sharey=True, draw_error=True,
                       scale=3, title="By metric"):
        """
        Draw a C x G table of plots with M lines each, where M - number of metrics,
        G - number of graphs, C - number of crawlers. Ox - crawling step, Oy - metric value.
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        nrows, ncols = C, G
        if C == 1:
            nrows = int(sqrt(G))
            ncols = ceil(G / nrows)
        if G == 1:
            nrows = int(sqrt(C))
            ncols = ceil(C / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by metric')
        aix = 0
        for i, c in enumerate(self.crawler_names):
            for j, g in enumerate(self.graph_full_names):
                if nrows > 1 and ncols > 1:
                    plt.sca(axs[aix // ncols, aix % ncols])
                elif nrows * ncols > 1:
                    plt.sca(axs[aix])
                if aix % G == 0:
                    plt.ylabel(self.labels[c])
                if i == 0:
                    plt.title(g[-1])
                if aix // ncols == nrows - 1:
                    plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
                aix += 1
                if x_lims:
                    plt.xlim(x_lims)
                for k, m in enumerate(self.metric_names):
                    contents = self.contents[g][c][m]
                    # Draw each instance
                    # for inst in range(len(contents['ys'])):
                    #     plt.plot(contents['x'], contents['ys'][inst],
                    #              color=COLORS[k % len(COLORS)], linewidth=0.5, linestyle=':')
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         color=COLORS[k % len(COLORS)], alpha=0.2)
                    plt.plot(xs, contents['avy'], color=COLORS[k % len(COLORS)], linewidth=1,
                             label="[%s] %s" % (self.instances[g][c][m], self.labels[m]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def draw_by_metric_crawler(self, x_lims=None, x_normalize=True, sharey=True,
                               swap_coloring_scheme=False, draw_error=True, scale=3,
                               title="By metric and crawler"):
        """
        Draw G plots with C x M lines each, where M - number of metrics, G - number of graphs,
        C - number of crawlers. Ox - crawling step, Oy - metric value.

        :param x_lims: x-limits for plots. Overrides x_lims passed in constructor
        :param x_normalize: if True, x values are normalized to lie in [0, 1]
        :param sharey: if True, share y axes between subplots
        :param swap_coloring_scheme: by default metrics differ in linestyle, crawlers differ in color.
            Set True to swap
        :param draw_error: if True, fill a standard deviation area around the averaged crawling curve
        :param scale: size of plots (default 3)
        :param title: figure title
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        nrows = int(sqrt(G))
        ncols = ceil(G / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by metric and crawler')
        aix = 0
        for j, g in enumerate(self.graph_full_names):
            if nrows > 1 and ncols > 1:
                plt.sca(axs[aix // ncols, aix % ncols])
            elif nrows * ncols > 1:
                plt.sca(axs[aix])
            if aix % ncols == 0:
                plt.ylabel('Metrics value')
            plt.title(g[-1])
            if aix // ncols == nrows - 1:
                plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
            aix += 1
            if x_lims:
                plt.xlim(x_lims)
            for k, c in enumerate(self.crawler_names):
                for i, m in enumerate(self.metric_names):
                    contents = self.contents[g][c][m]
                    ls, col = (k, i) if swap_coloring_scheme else (i, k)
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         alpha=0.2, color=COLORS[col % len(COLORS)])
                    plt.plot(xs, contents['avy'], linewidth=1,
                             linestyle=LINESTYLES[ls % len(LINESTYLES)],
                             color=COLORS[col % len(COLORS)],
                             label="[%s] %s, %s" % (self.instances[g][c][m],
                                                    self.labels[c], self.labels[m]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def _compute_aggregated(self, x_lims=None):
        """
        :param x_lims: if specified as (x_from, x_to), compute AUCC over an interval
            containing the specified one
        """
        x_lims = x_lims or self.x_lims
        if len(self.auccs) > 0:
            return

        # Compute AUCCs
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)
        self.auccs.clear()
        pbar = tqdm(total=G * C * M, desc='Computing AUCCs')
        for g in self.graph_full_names:
            self.auccs[g] = {}
            for c in self.crawler_names:
                self.auccs[g][c] = {}
                for m in self.metric_names:
                    self.auccs[g][c][m] = aucc = {}
                    contents = self.contents[g][c][m]
                    xs = contents['x']
                    ys = contents['ys']
                    i0 = 0
                    i1 = None
                    if x_lims:
                        x0, x1 = x_lims
                        i0 = bisect_left(xs, x0)
                        i1 = bisect_right(xs, x1) + 1
                    aucc['AUCC'] = [compute_aucc(xs[i0: i1], ys[inst][i0: i1])
                                    for inst in range(len(ys))]
                    aucc['wAUCC'] = [compute_waucc(xs[i0: i1], ys[inst][i0: i1])
                                     for inst in range(len(ys))]
                    aucc['TC'] = [compute_targets_crawled(xs[i0: i1], ys[inst][i0: i1])
                                  for inst in range(len(ys))]
                    pbar.update(1)
        pbar.close()

    def get_aggregated(self, aggregator='AUCC', x_lims=None, median=False, print_results=False):
        """
        Get results according to an aggregator (AUCC, wAUCC, TC).

        :param aggregator: function translating a crawling curve into one number: 'AUCC', 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param median: if True, compute median instead of mean
        :param print_results: if True, print results
        :return: list of results as tuples (num_instances, Graph, Crawler, Metric, mean, error)
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)

        results = []
        for g in self.graph_full_names:
            for i, m in enumerate(self.metric_names):
                errors = [variance(self.auccs[g][c][m][aggregator]) ** 0.5
                          for c in self.crawler_names]
                avgs = [average(self.auccs[g][c][m][aggregator], median)
                        for c in self.crawler_names]
                for ix, c in enumerate(self.crawler_names):
                    results.append(
                        (len(self.contents[g][c][m]['ys']), '/'.join(g), self.labels[c],
                         self.labels[m], avgs[ix], errors[ix]))
        if print_results:
            for n, g, c, m, avg, err in results:
                string = "[%s] " % n + ', '.join([g, c, m, "%.1f+-%.1f" % (avg, err)])
                print(string)
        return results

    def draw_aggregated(self, aggregator='AUCC', x_lims=None, scale=3, sharey=True, boxplot=True,
                        xticks_rotation=90, title=None, draw_count=True):
        """
        Draw G plots with M lines each. Ox - C crawlers, Oy - aggregated value
        (M curves with error bars). M - number of metrics, G - number of graphs, C - number of crawlers.

        :param aggregator: function translating a crawling curve into one number:
            'AUCC' (default), 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param scale: size of plots (default 3)
        :param sharey: if True, share y axes between subplots
        :param boxplot: if True, draw boxplots instead of error bars (used only when one metric is drawn)
        :param xticks_rotation: rotate x-ticks (default 90 degrees)
        :param title: figure title
        :param draw_count: if True, prepend the number of instances to each label
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)
        if M > 1:
            boxplot = False

        # Draw
        nrows = int(sqrt(G))
        ncols = ceil(G / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=True, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, 1 + scale * nrows))
        aix = 0
        pbar = tqdm(total=G * M, desc='Plotting %s' % aggregator)
        xs = list(range(1, 1 + C))
        for g in self.graph_full_names:
            if nrows > 1 and ncols > 1:
                plt.sca(axs[aix // ncols, aix % ncols])
            elif nrows * ncols > 1:
                plt.sca(axs[aix])
            if aix == 0:
                plt.ylabel('%s value' % aggregator)
            plt.title(g[-1])
            # For each crawler, a list of instance counts, one per metric
            labels = [[] for _ in self.crawler_names]
            for i, m in enumerate(self.metric_names):
                errors = [variance(self.auccs[g][c][m][aggregator]) ** 0.5
                          for c in self.crawler_names]
                ys = [self.auccs[g][c][m][aggregator] for c in self.crawler_names]
                means = [average(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                # meds = [np.median(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                if boxplot:
                    box_plot = plt.boxplot(ys)
                    for median in box_plot['medians']:
                        median.set_color('red')
                else:
                    plt.errorbar(xs, means, errors, label=self.labels[m], marker='.',
                                 capsize=5, color=COLORS[i % len(COLORS)])
                for ix, c in enumerate(self.crawler_names):
                    print(f"[{len(self.contents[g][c][m]['ys'])}]", g, self.labels[c],
                          self.labels[m], "%.1f+-%.1f" % (means[ix], errors[ix]))
                for j, c in enumerate(self.crawler_names):
                    labels[j].append(len(self.contents[g][c][m]['ys']))
                pbar.update(1)
            labels = [(f"[{','.join(str(l) for l in ls)}] " if draw_count else "") + self.labels[c]
                      for c, ls in zip(self.crawler_names, labels)]
            plt.xticks(xs, labels, rotation=xticks_rotation)
            aix += 1
        pbar.close()
        if not boxplot:
            plt.legend()
        plt.tight_layout()

    def draw_winners(self, aggregator='AUCC', x_lims=None, scale=8, xticks_rotation=90, title=None):
        """
        Draw C stacked bars (each of M elements). Ox - C crawlers, Oy - number of wins (among G)
        by (w)AUCC value. Graphs where not all configurations are present are skipped.

        :param aggregator: function translating a crawling curve into one number:
            'AUCC' (default), 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param scale: size of plots (default 8)
        :param xticks_rotation: rotate x-ticks (default 90 degrees)
        :param title: figure title
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)

        # Compute winners
        winners = {}  # winners[crawler][metric] -> count
        for c in self.crawler_names:
            winners[c] = {}
            for m in self.metric_names:
                winners[c][m] = 0
        for m in self.metric_names:
            for g in self.graph_full_names:
                ca = [average(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                if any(np.isnan(ca)):
                    continue
                winner = self.crawler_names[np.argmax(ca)]
                winners[winner][m] += 1

        # Draw
        plt.figure(num=title or "Winners by %s" % aggregator, figsize=(1 + scale, scale))
        xs = list(range(1, 1 + C))
        prev_bottom = np.zeros(C)
        for i, m in enumerate(self.metric_names):
            h = [winners[c][m] for c in self.crawler_names]
            plt.bar(xs, h, width=0.8, bottom=prev_bottom,
                    color=COLORS[i % len(COLORS)], label=self.labels[m])
            prev_bottom += h
        plt.ylabel('Wins by %s value' % aggregator)
        plt.xticks(xs, [self.labels[c] for c in self.crawler_names], rotation=xticks_rotation)
        plt.legend()
        plt.tight_layout()

    def show_plots(self):
        """ Show drawn matplotlib plots. """
        plt.show()

    @staticmethod
    def next_file(folder: Path):
        """
        Return a path with the smallest number not present in the folder.
        E.g. if the folder has 0.json and 2.json, the path for 1.json is returned.
        """
        ix = 0
        while True:
            path = folder / f"{ix}.json"
            if not path.exists():  # return the first name that does not exist yet
                return path
            ix += 1

    @staticmethod
    def merge_folders(*path, not_earlier_than=None, not_later_than=None, check_identical=False,
                      copy=False):
        """
        Merge all results into one folder: path[1], path[2], etc. are merged into path[0].
        Name collisions are resolved by assigning new smallest numbers, e.g. when 0.json is added
        to a folder with 0.json and 2.json, it becomes 1.json.

        Args:
            *path: list of paths, each structured like the original results/ directory.
            not_earlier_than: consider only files with modification time not earlier than specified.
            not_later_than: consider only files with modification time not later than specified.
            check_identical: before renaming, check whether equally named files are identical.
            copy: if True, copy all moved elements instead of moving them.
        """
        if len(path) < 2:
            raise RuntimeError("Specify more than 1 path to be merged")
        import filecmp

        if not_earlier_than is not None:
            not_earlier_than = not_earlier_than.timestamp()
        if not_later_than is not None:
            not_later_than = not_later_than.timestamp()

        def check_datetime(path: Path):
            """ Check modification time against the given bounds. """
            if not_earlier_than is not None:
                if path.stat().st_mtime < not_earlier_than:
                    return False
            if not_later_than is not None:
                if path.stat().st_mtime > not_later_than:
                    return False
            return True

        def merge(dst_path: Path, src_path: Path):
            src_content = os.listdir(src_path)
            for name in src_content:
                src_subpath = src_path / name
                dst_subpath = dst_path / name
                if src_subpath.is_file():  # file
                    if check_datetime(src_subpath):
                        if dst_subpath.exists():
                            # Rename
                            if check_identical and filecmp.cmp(src_subpath, dst_subpath):
                                # If files are the same, avoid duplication
                                rname_move_dirmove_ident[3] += 1
                                if not copy:
                                    os.remove(src_subpath)
                            else:
                                new_path = ResultsMerger.next_file(dst_subpath.parent)
                                (shutil.copy if copy else shutil.move)(src_subpath, new_path)
                                rname_move_dirmove_ident[0] += 1
                        else:
                            # Just move
                            dst_subpath.parent.mkdir(parents=True, exist_ok=True)
                            (shutil.copy if copy else shutil.move)(src_subpath, dst_subpath)
                            rname_move_dirmove_ident[1] += 1
                else:  # directory
                    merge(dst_subpath, src_subpath)

        dst = Path(path[0])
        for i in range(1, len(path)):
            print("Merging", path[i], "->", dst)
            rname_move_dirmove_ident = [0, 0, 0, 0]
            merge(dst, Path(path[i]))
            print(rname_move_dirmove_ident[0], "files renamed")
            print(rname_move_dirmove_ident[1], "files as is")
            print(rname_move_dirmove_ident[2], "directories as is")
            print(rname_move_dirmove_ident[3], "files coincide")
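

# A minimal usage sketch, kept as comments. The graph name, declarations and budget below are
# hypothetical placeholders, not values shipped with this module:
#
#   crawler_decls = [...]   # declarations accepted by Crawler.from_declaration
#   metric_decls = [...]    # declarations accepted by Metric.from_declaration
#   rm = ResultsMerger(graph_full_names=[('konect', 'example-graph')],
#                      crawler_decls=crawler_decls, metric_decls=metric_decls, budget=1000)
#   rm.draw_by_crawler()
#   rm.draw_aggregated(aggregator='AUCC')
#   rm.show_plots()
#
# Results collected on different machines could be combined beforehand, e.g.:
#   ResultsMerger.merge_folders('results', 'results_from_server', check_identical=True)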