Source code for running.merger

import glob
import json
import logging
import os
import re
import shutil
from bisect import bisect_left, bisect_right
from math import sqrt, ceil
from pathlib import Path

import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from crawlers.cbasic import Crawler
from crawlers.declarable import declaration_to_filename
from running.metrics import Metric
from utils import RESULT_DIR


def compute_aucc(xs, ys):
    """ Area Under the Crawling Curve: trapezoidal integral of ys over xs, with xs scaled so the last point is 1. """
    # from sklearn.metrics import auc
    # return auc(xs, ys)
    assert len(xs) == len(ys) > 0
    xs = xs / xs[-1]
    res = xs[0] * ys[0] / 2
    for i in range(1, len(xs)):
        res += (xs[i] - xs[i-1]) * (ys[i-1] + ys[i]) / 2
    return res


def compute_waucc(xs, ys):
    """ Weighted AUCC: each trapezoid is weighted by 1/x, which emphasizes the early part of the curve. """
    # res = compute_aucc(np.log(xs), ys)
    assert len(xs) == len(ys) > 0
    xs = xs / xs[-1]
    res = 0 if xs[0] == 0 else ys[0]
    norm = 0 if xs[0] == 0 else 1
    for i in range(1, len(xs)):
        res += (xs[i] - xs[i-1]) * (ys[i-1] + ys[i]) / 2 / xs[i]
        norm += (xs[i] - xs[i-1]) / xs[i]
    return res / norm


def compute_targets_crawled(xs, ys):
    """ Final metric value at the end of the crawl, e.g. the number/fraction of targets crawled. """
    assert len(xs) == len(ys) > 0
    return ys[-1]
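
# A minimal illustration of the three aggregators above (hypothetical toy values, not taken
# from real crawl results). AUCC is the trapezoidal area under the curve after scaling x to
# [0, 1]; wAUCC weights every trapezoid by 1/x, stressing the early stage of crawling; the
# targets-crawled aggregator simply reports the final y value:
#
#   xs = np.array([1, 2, 4]); ys = np.array([0.1, 0.3, 0.6])
#   compute_aucc(xs, ys)             # ~= 0.2875
#   compute_waucc(xs, ys)            # ~= 0.2125
#   compute_targets_crawled(xs, ys)  # 0.6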


def average(array, median=False, **kwargs):
    if len(array) == 0:
        return np.nan
    return (np.median if median else np.mean)(array, **kwargs)


def variance(array, **kwargs):
    if len(array) == 0:
        return np.nan
    return np.var(array, **kwargs)


LINESTYLES = ['-', '--', ':', '-.']
COLORS = ['black', 'b', 'g', 'r', 'c', 'm', 'y',
          'darkblue', 'darkgreen', 'darkred', 'darkmagenta', 'darkorange', 'darkcyan',
          'pink', 'lime', 'wheat', 'lightsteelblue']


class ResultsMerger:
    """
    ResultsMerger can aggregate and plot results saved in files.
    It processes all combinations of G graphs x C crawlers x M metrics, averaging over n instances of each.
    All missing instances are just ignored.

    Plotting functions:

    * draw_by_crawler - Draw M x G table of plots with C lines each. Ox - crawling step, Oy - metric value.
    * draw_by_metric_crawler - Draw G plots with C x M lines each. Ox - crawling step, Oy - metric value.
    * draw_by_metric - Draw C x G table of plots with M lines each. Ox - crawling step, Oy - metric value.
    * draw_aggregated - Draw G plots with M lines. Ox - C crawlers, Oy - (w)AUCC value (M curves with error bars).
    * draw_winners - Draw C stacked bars (each of M elements). Ox - C crawlers, Oy - number of wins (among G) by (w)AUCC value.

    Additional functions:

    * missing_instances - Calculate how many instances of all configurations are missing.
    * move_folders - Move/remove/copy saved instances for current graphs, crawlers, metrics.

    NOTES:

    * x values must be the same for all files and are the ones generated by
      `exponential_batch_generator()` from running/runner.py
    * it is assumed that for all instances the value lists are of equal lengths (i.e. budgets).
      Otherwise normalisation and aggregation may fail. If so, use the `x_lims` parameter to control this.
    """

    def __init__(self, graph_full_names, crawler_decls, metric_decls, budget,
                 n_instances=None, x_lims=None, result_dir=RESULT_DIR, numeric_only=True):
        """
        :param graph_full_names: list of graphs full names.
        :param crawler_decls: list of crawlers declarations.
        :param metric_decls: list of metrics declarations. Non-numeric metrics will be ignored.
        :param budget: results with this budget will be taken.
        :param n_instances: number of instances to average over, None for all.
        :param x_lims: use only the specified x-limits for all plots unless another value is given
            in a plotting function.
        :param result_dir: specify to use a non-default directory where results are stored.
        :param numeric_only: if True (default), ignore non-numeric metrics.
        """
        self.graph_full_names = graph_full_names
        self.crawler_names = []  # list(map(declaration_to_filename, crawler_decls))
        self.metric_names = []  # list(map(declaration_to_filename, metric_decls))
        self.labels = {}  # pretty short names to draw in plots

        # Generate pretty names for crawlers and metrics for plots
        for md in metric_decls:
            m = Metric.from_declaration(md, graph=None)
            if numeric_only and not m.is_numeric:  # Ignore non-numeric metrics
                continue
            f = declaration_to_filename(m.declaration)
            self.metric_names.append(f)
            self.labels[f] = m.name
        for cd in crawler_decls:
            c = Crawler.from_declaration(cd, graph=None)
            f = declaration_to_filename(c.declaration)
            self.crawler_names.append(f)
            self.labels[f] = c.name

        self.budget = budget
        self.n_instances = n_instances
        self.x_lims = x_lims
        self.instances = {}  # instances[graph][crawler][metric] -> count of instances
        # contents[graph][crawler][metric]:
        #   'x' -> [nums of steps],
        #   'ys' -> [[y for each step] for each instance],
        #   'avy' -> [avg y for each step]
        self.contents = {}
        # auccs[graph][crawler][metric]:
        #   'AUCC' -> [AUCC for each instance],
        #   'wAUCC' -> [wAUCC for each instance]
        self.auccs = {}
        self.result_dir = result_dir

        self._read()
        plt.style.use('seaborn')

    @staticmethod
    def names_to_path(graph_full_name: tuple, crawler_name: str, metric_name: str, budget: int,
                      result_dir=RESULT_DIR):
        """
        Returns file pattern, e.g.
        '/home/misha/workspace/crawling/results/ego-gplus/POD(batch=1)/TopK(centrality=BtwDistr,measure=Re,part=crawled,top=0.01)/\*.json'
        """
        # TODO apply
        # path = Path(
        #     result_dir, *graph_full_name, crawler_name, metric_name, f"budget={budget}", "*.json")
        path = Path(result_dir, *graph_full_name, crawler_name, metric_name, "*.json")
        return path

    def _read(self):
        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Reading history')
        self.instances.clear()
        # self.contents.clear()
        for g in self.graph_full_names:
            self.instances[g] = {}
            self.contents[g] = {}
            for c in self.crawler_names:
                self.instances[g][c] = {}
                self.contents[g][c] = {}
                for m in self.metric_names:
                    # TODO apply
                    # path = ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir)
                    # fn_pattern = re.compile(r'(\d+)\.json')
                    # paths = []
                    # for file in path.parent.iterdir():
                    #     match = re.findall(fn_pattern, file.name)
                    #     if match:
                    #         print(file.name, match[0][0])
                    #         paths.append(file)
                    # paths = sorted(paths)[:self.n_instances]
                    path_pattern = ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir)
                    # FIXME workaround for glob since '[' is a special symbol for it
                    path_pattern = str(path_pattern).replace('[', '[[]')
                    paths = sorted(glob.glob(path_pattern))[:self.n_instances]
                    self.instances[g][c][m] = len(paths)
                    self.contents[g][c][m] = contents = {}
                    count = len(paths)
                    contents['x'] = []
                    contents['ys'] = ys = [[] for _ in range(count)]
                    contents['avy'] = []
                    i0 = 0
                    i1 = None
                    for inst, p in enumerate(paths):
                        with open(p, 'r') as f:
                            imported = json.load(f)
                        if len(contents['x']) == 0:
                            xs = np.array(sorted([int(x) for x in imported.keys()]))
                            if self.x_lims:  # Cut over x_lims
                                x0, x1 = self.x_lims
                                i0 = bisect_left(xs, x0)
                                i1 = bisect_right(xs, x1) + 1
                            xs = xs[i0: i1]
                            contents['x'] = xs
                        if inst == 0:
                            contents['avy'] = np.zeros(len(xs))
                        try:
                            # Convert to float and compute the average if possible
                            ys[inst] = np.array([float(x) for x in imported.values()])[i0: i1]
                            contents['avy'] += np.array(ys[inst]) / count
                        except TypeError:
                            # Non-numeric values - keep as is
                            ys[inst] = np.array(list(imported.values()))[i0: i1]
                    pbar.update(1)
        pbar.close()

    def move_folders(self, path_from=None, path_to=None, copy=False):
        """
        Move/remove/copy all saved instances for current [graphs X crawlers X metrics].
        Specify the `path_to` parameter to move files instead of removing them.

        :param path_from: this folder is the root for all folders to be (re)moved;
            it must be contained in the path to those folders
        :param path_to: this folder is the destination for all folders to be moved.
            If None (default), all folders will be removed.
        :param copy: set to True to copy folders instead of moving them
        """
        if path_from is None:
            path_from = self.result_dir
        path_from = str(path_from)
        if path_to is not None:
            path_to = str(path_to)
        move_or_copy = shutil.copytree if copy else shutil.move
        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='(Re)moving history')
        folder = None
        removed = 0
        removed_empty = 0
        moved = 0
        from os.path import dirname as parent
        from os.path import exists as exist
        for g in self.graph_full_names:
            for c in self.crawler_names:
                for m in self.metric_names:
                    folder = str(ResultsMerger.names_to_path(g, c, m, self.budget, self.result_dir).parent)
                    if not exist(folder):
                        continue
                    if path_to is None:  # remove
                        shutil.rmtree(folder, ignore_errors=True)
                        removed += 1
                    else:  # move or copy
                        assert path_from in folder
                        dst = folder.replace(path_from, path_to)
                        move_or_copy(folder, dst)
                        moved += 1
                    pbar.update(1)
                # Remove parent folder if it exists and is empty
                if exist(parent(folder)) and not os.listdir(parent(folder)):
                    os.rmdir(parent(folder))
                    removed_empty += 1
            # Remove grandparent folder if it exists and is empty
            if exist(parent(parent(folder))) and not os.listdir(parent(parent(folder))):
                os.rmdir(parent(parent(folder)))
                removed_empty += 1
        pbar.close()
        print("Moved %s folders, removed %s folders including %s empty ones" % (
            moved, removed, removed_empty))
        self.instances.clear()
        self.contents.clear()

    def missing_instances(self) -> dict:
        """
        Return a dict of configurations where the number of computed instances is less than n_instances.

        :return: result[graph][crawler][metric] -> missing count
        """
        missing = {}
        for g in self.graph_full_names:
            missing[g] = {}
            for c in self.crawler_names:
                missing[g][c] = {}
                for m in self.metric_names:
                    present = self.instances[g][c][m]
                    if self.n_instances is not None and self.n_instances > present:
                        missing[g][c][m] = self.n_instances - present
                if len(missing[g][c]) == 0:
                    del missing[g][c]
            if len(missing[g]) == 0:
                del missing[g]
        # print(json.dumps(missing, indent=2))
        return missing

    def draw_by_crawler(self, x_lims=None, x_normalize=True, sharey=True, draw_error=True,
                        draw_each_instance=False, scale=3, title="By crawler"):
        """
        Draw an M x G table of plots with C lines each, where M - number of metrics,
        G - number of graphs, C - number of crawlers. Ox - crawling step, Oy - metric value.

        :param x_lims: x-limits for plots. Overrides x_lims passed in constructor
        :param x_normalize: if True, x values are normalized to lie in [0, 1]
        :param sharey: if True, share y axes between subplots
        :param draw_error: if True, fill a standard deviation area around the averaged crawling curve
        :param draw_each_instance: if True, show each instance
        :param scale: size of plots (default 3)
        :param title: figure title
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        M = len(self.metric_names)
        nrows, ncols = M, G
        if M == 1:
            nrows = int(sqrt(G))
            ncols = ceil(G / nrows)
        if G == 1:
            nrows = int(sqrt(M))
            ncols = ceil(M / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by crawler')
        aix = 0
        for i, m in enumerate(self.metric_names):
            for j, g in enumerate(self.graph_full_names):
                if nrows > 1 and ncols > 1:
                    plt.sca(axs[aix // ncols, aix % ncols])
                elif nrows * ncols > 1:
                    plt.sca(axs[aix])
                if aix % G == 0:
                    plt.ylabel(self.labels[m])
                if i == 0:
                    plt.title(g[-1])
                if aix // ncols == nrows - 1:
                    plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
                aix += 1
                if x_lims:
                    plt.xlim(x_lims)
                for k, c in enumerate(self.crawler_names):
                    contents = self.contents[g][c][m]
                    # Draw each instance
                    if draw_each_instance:
                        for inst in range(len(contents['ys'])):
                            plt.plot(contents['x'], contents['ys'][inst],
                                     color=COLORS[k % len(COLORS)], linewidth=1, linestyle=':')
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         color=COLORS[k % len(COLORS)], alpha=0.2)
                    plt.plot(xs, contents['avy'], color=COLORS[k % len(COLORS)], linewidth=1,
                             label="[%s] %s" % (self.instances[g][c][m], self.labels[c]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def draw_by_metric(self, x_lims=None, x_normalize=True, sharey=True, draw_error=True,
                       scale=3, title="By metric"):
        """
        Draw a C x G table of plots with M lines each, where M - number of metrics,
        G - number of graphs, C - number of crawlers. Ox - crawling step, Oy - metric value.
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        nrows, ncols = C, G
        if C == 1:
            nrows = int(sqrt(G))
            ncols = ceil(G / nrows)
        if G == 1:
            nrows = int(sqrt(C))
            ncols = ceil(C / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by metric')
        aix = 0
        for i, c in enumerate(self.crawler_names):
            for j, g in enumerate(self.graph_full_names):
                if nrows > 1 and ncols > 1:
                    plt.sca(axs[aix // ncols, aix % ncols])
                elif nrows * ncols > 1:
                    plt.sca(axs[aix])
                if aix % G == 0:
                    plt.ylabel(self.labels[c])
                if i == 0:
                    plt.title(g[-1])
                if aix // ncols == nrows - 1:
                    plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
                aix += 1
                if x_lims:
                    plt.xlim(x_lims)
                for k, m in enumerate(self.metric_names):
                    contents = self.contents[g][c][m]
                    # Draw each instance
                    # for inst in range(len(contents['ys'])):
                    #     plt.plot(contents['x'], contents['ys'][inst],
                    #              color=COLORS[k % len(COLORS)], linewidth=0.5, linestyle=':')
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         color=COLORS[k % len(COLORS)], alpha=0.2)
                    plt.plot(xs, contents['avy'], color=COLORS[k % len(COLORS)], linewidth=1,
                             label="[%s] %s" % (self.instances[g][c][m], self.labels[m]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def draw_by_metric_crawler(self, x_lims=None, x_normalize=True, sharey=True,
                               swap_coloring_scheme=False, draw_error=True, scale=3,
                               title="By metric and crawler"):
        """
        Draw G plots with C x M lines each, where M - number of metrics, G - number of graphs,
        C - number of crawlers. Ox - crawling step, Oy - metric value.

        :param x_lims: x-limits for plots. Overrides x_lims passed in constructor
        :param x_normalize: if True, x values are normalized to lie in [0, 1]
        :param sharey: if True, share y axes between subplots
        :param swap_coloring_scheme: by default metrics differ in linestyle, crawlers differ in color.
            Set True to swap
        :param draw_error: if True, fill a standard deviation area around the averaged crawling curve
        :param scale: size of plots (default 3)
        :param title: figure title
        """
        x_lims = x_lims or self.x_lims
        G = len(self.graph_full_names)
        nrows = int(sqrt(G))
        ncols = ceil(G / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=x_normalize, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, scale * nrows))

        total = len(self.graph_full_names) * len(self.crawler_names) * len(self.metric_names)
        pbar = tqdm(total=total, desc='Plotting by metric and crawler')
        aix = 0
        for j, g in enumerate(self.graph_full_names):
            if nrows > 1 and ncols > 1:
                plt.sca(axs[aix // ncols, aix % ncols])
            elif nrows * ncols > 1:
                plt.sca(axs[aix])
            if aix % ncols == 0:
                plt.ylabel('Metrics value')
            plt.title(g[-1])
            if aix // ncols == nrows - 1:
                plt.xlabel('Nodes fraction crawled' if x_normalize else 'Nodes crawled')
            aix += 1
            if x_lims:
                plt.xlim(x_lims)
            for k, c in enumerate(self.crawler_names):
                for i, m in enumerate(self.metric_names):
                    contents = self.contents[g][c][m]
                    ls, col = (k, i) if swap_coloring_scheme else (i, k)
                    # Draw variance
                    xs = contents['x']
                    if x_normalize and len(xs) > 0:
                        xs = xs / xs[-1]
                    if len(xs) > 0 and draw_error:
                        error = variance(contents['ys'], axis=0) ** 0.5
                        plt.fill_between(xs, contents['avy'] - error, contents['avy'] + error,
                                         alpha=0.2, color=COLORS[col % len(COLORS)])
                    plt.plot(xs, contents['avy'], linewidth=1,
                             linestyle=LINESTYLES[ls % len(LINESTYLES)],
                             color=COLORS[col % len(COLORS)],
                             label="[%s] %s, %s" % (self.instances[g][c][m],
                                                    self.labels[c], self.labels[m]))
                    pbar.update(1)
        pbar.close()
        plt.legend()
        plt.tight_layout()

    def _compute_aggregated(self, x_lims=None):
        """
        :param x_lims: if specified as (x_from, x_to), compute AUCC over an interval
            containing the specified one
        """
        x_lims = x_lims or self.x_lims
        if len(self.auccs) > 0:
            return

        # Compute AUCCs
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)
        self.auccs.clear()
        pbar = tqdm(total=G * C * M, desc='Computing AUCCs')
        for g in self.graph_full_names:
            self.auccs[g] = {}
            for c in self.crawler_names:
                self.auccs[g][c] = {}
                for m in self.metric_names:
                    self.auccs[g][c][m] = aucc = {}
                    contents = self.contents[g][c][m]
                    xs = contents['x']
                    ys = contents['ys']
                    i0 = 0
                    i1 = None
                    if x_lims:
                        x0, x1 = x_lims
                        i0 = bisect_left(xs, x0)
                        i1 = bisect_right(xs, x1) + 1
                    aucc['AUCC'] = [compute_aucc(xs[i0: i1], ys[inst][i0: i1])
                                    for inst in range(len(ys))]
                    aucc['wAUCC'] = [compute_waucc(xs[i0: i1], ys[inst][i0: i1])
                                     for inst in range(len(ys))]
                    aucc['TC'] = [compute_targets_crawled(xs[i0: i1], ys[inst][i0: i1])
                                  for inst in range(len(ys))]
                    pbar.update(1)
        pbar.close()

    def get_aggregated(self, aggregator='AUCC', x_lims=None, median=False, print_results=False):
        """
        Get results according to an aggregator (AUCC, wAUCC, TC).

        :param aggregator: function translating a crawling curve into one number: 'AUCC', 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param median: if True, compute median instead of mean
        :param print_results: if True, print results
        :return: list of results as tuples (num_instances, Graph, Crawler, Metric, mean, error)
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)

        results = []
        for g in self.graph_full_names:
            for i, m in enumerate(self.metric_names):
                errors = [variance(self.auccs[g][c][m][aggregator]) ** 0.5
                          for c in self.crawler_names]
                avgs = [average(self.auccs[g][c][m][aggregator], median)
                        for c in self.crawler_names]
                for ix, c in enumerate(self.crawler_names):
                    results.append(
                        (len(self.contents[g][c][m]['ys']), '/'.join(g), self.labels[c],
                         self.labels[m], avgs[ix], errors[ix]))
        if print_results:
            for n, g, c, m, avg, err in results:
                string = "[%s] " % n + ', '.join([g, c, m, "%.1f+-%.1f" % (avg, err)])
                print(string)
        return results

    def draw_aggregated(self, aggregator='AUCC', x_lims=None, scale=3, sharey=True, boxplot=True,
                        xticks_rotation=90, title=None, draw_count=True):
        """
        Draw G plots with M lines each. Ox - C crawlers, Oy - aggregated value
        (M curves with error bars). M - number of metrics, G - number of graphs, C - number of crawlers.

        :param aggregator: function translating a crawling curve into one number:
            'AUCC' (default), 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param scale: size of plots (default 3)
        :param sharey: if True, share y axes between subplots
        :param boxplot: if True, draw boxplots instead of error bars (used only when one metric is drawn)
        :param xticks_rotation: rotate x-ticks (default 90 degrees)
        :param title: figure title
        :param draw_count: if True, prepend the number of instances to each label
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)
        if M > 1:
            boxplot = False

        # Draw
        nrows = int(sqrt(G))
        ncols = ceil(G / nrows)
        fig, axs = plt.subplots(nrows, ncols, sharex=True, sharey=sharey, num=title,
                                figsize=(1 + scale * ncols, 1 + scale * nrows))
        aix = 0
        pbar = tqdm(total=G * M, desc='Plotting %s' % aggregator)
        xs = list(range(1, 1 + C))
        for g in self.graph_full_names:
            if nrows > 1 and ncols > 1:
                plt.sca(axs[aix // ncols, aix % ncols])
            elif nrows * ncols > 1:
                plt.sca(axs[aix])
            if aix == 0:
                plt.ylabel('%s value' % aggregator)
            plt.title(g[-1])
            # For each crawler, a list of instance counts, one per metric
            labels = [[] for _ in self.crawler_names]
            for i, m in enumerate(self.metric_names):
                errors = [variance(self.auccs[g][c][m][aggregator]) ** 0.5
                          for c in self.crawler_names]
                ys = [self.auccs[g][c][m][aggregator] for c in self.crawler_names]
                means = [average(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                # meds = [np.median(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                if boxplot:
                    box_plot = plt.boxplot(ys)
                    for median in box_plot['medians']:
                        median.set_color('red')
                else:
                    plt.errorbar(xs, means, errors, label=self.labels[m], marker='.',
                                 capsize=5, color=COLORS[i % len(COLORS)])
                for ix, c in enumerate(self.crawler_names):
                    print(f"[{len(self.contents[g][c][m]['ys'])}]", g, self.labels[c],
                          self.labels[m], "%.1f+-%.1f" % (means[ix], errors[ix]))
                for j, c in enumerate(self.crawler_names):
                    labels[j].append(len(self.contents[g][c][m]['ys']))
                pbar.update(1)
            labels = [(f"[{','.join(str(l) for l in ls)}] " if draw_count else "") + self.labels[c]
                      for c, ls in zip(self.crawler_names, labels)]
            plt.xticks(xs, labels, rotation=xticks_rotation)
            aix += 1
        pbar.close()
        if not boxplot:
            plt.legend()
        plt.tight_layout()

    def draw_winners(self, aggregator='AUCC', x_lims=None, scale=8, xticks_rotation=90, title=None):
        """
        Draw C stacked bars (each of M elements). Ox - C crawlers, Oy - number of wins (among G)
        by (w)AUCC value. Graphs where not all configurations are present are skipped.

        :param aggregator: function translating a crawling curve into one number:
            'AUCC' (default), 'wAUCC', or 'TC'
        :param x_lims: x-limits passed to aggregator. Overrides x_lims passed in constructor
        :param scale: size of plots (default 8)
        :param xticks_rotation: rotate x-ticks (default 90 degrees)
        :param title: figure title
        """
        assert aggregator in ['AUCC', 'wAUCC', 'TC']
        x_lims = x_lims or self.x_lims
        self._compute_aggregated(x_lims=x_lims)
        G = len(self.graph_full_names)
        C = len(self.crawler_names)
        M = len(self.metric_names)

        # Compute winners
        winners = {}  # winners[crawler][metric] -> count
        for c in self.crawler_names:
            winners[c] = {}
            for m in self.metric_names:
                winners[c][m] = 0
        for m in self.metric_names:
            for g in self.graph_full_names:
                ca = [average(self.auccs[g][c][m][aggregator]) for c in self.crawler_names]
                if any(np.isnan(ca)):
                    continue
                winner = self.crawler_names[np.argmax(ca)]
                winners[winner][m] += 1

        # Draw
        plt.figure(num=title or "Winners by %s" % aggregator, figsize=(1 + scale, scale))
        xs = list(range(1, 1 + C))
        prev_bottom = np.zeros(C)
        for i, m in enumerate(self.metric_names):
            h = [winners[c][m] for c in self.crawler_names]
            plt.bar(xs, h, width=0.8, bottom=prev_bottom,
                    color=COLORS[i % len(COLORS)], label=self.labels[m])
            prev_bottom += h
        plt.ylabel('Wins by %s value' % aggregator)
        plt.xticks(xs, [self.labels[c] for c in self.crawler_names], rotation=xticks_rotation)
        plt.legend()
        plt.tight_layout()

    def show_plots(self):
        """ Show drawn matplotlib plots. """
        plt.show()

    @staticmethod
    def next_file(folder: Path):
        """
        Return a path with the smallest number not present in the folder.
        E.g. if the folder has 0.json and 2.json, the path for 1.json is returned.
        """
        ix = 0
        while True:
            path = folder / f"{ix}.json"
            if not path.exists():  # return the first name that does not exist yet
                return path
            ix += 1

    @staticmethod
    def merge_folders(*path, not_earlier_than=None, not_later_than=None, check_identical=False,
                      copy=False):
        """
        Merge all results into one folder: path[1], path[2], etc. are merged into path[0].
        Name collisions are resolved by assigning new smallest numbers, e.g. when 0.json is added
        to a folder with 0.json and 2.json, it becomes 1.json.

        Args:
            *path: list of paths, each structured like the original results/ directory.
            not_earlier_than: consider only files with modification time not earlier than specified.
            not_later_than: consider only files with modification time not later than specified.
            check_identical: before renaming, check whether equally named files are identical.
            copy: if True, copy all moved elements instead of moving them.
        """
        if len(path) < 2:
            raise RuntimeError("Specify more than 1 path to be merged")
        import filecmp

        if not_earlier_than is not None:
            not_earlier_than = not_earlier_than.timestamp()
        if not_later_than is not None:
            not_later_than = not_later_than.timestamp()

        def check_datetime(path: Path):
            """ Check modification time against the given bounds. """
            if not_earlier_than is not None:
                if path.stat().st_mtime < not_earlier_than:
                    return False
            if not_later_than is not None:
                if path.stat().st_mtime > not_later_than:
                    return False
            return True

        def merge(dst_path: Path, src_path: Path):
            src_content = os.listdir(src_path)
            for name in src_content:
                src_subpath = src_path / name
                dst_subpath = dst_path / name
                if src_subpath.is_file():  # file
                    if check_datetime(src_subpath):
                        if dst_subpath.exists():
                            # Rename
                            if check_identical and filecmp.cmp(src_subpath, dst_subpath):
                                # If files are the same, avoid duplication
                                rname_move_dirmove_ident[3] += 1
                                if not copy:
                                    os.remove(src_subpath)
                            else:
                                new_path = ResultsMerger.next_file(dst_subpath.parent)
                                (shutil.copy if copy else shutil.move)(src_subpath, new_path)
                                rname_move_dirmove_ident[0] += 1
                        else:
                            # Just move
                            dst_subpath.parent.mkdir(parents=True, exist_ok=True)
                            (shutil.copy if copy else shutil.move)(src_subpath, dst_subpath)
                            rname_move_dirmove_ident[1] += 1
                else:  # directory
                    merge(dst_subpath, src_subpath)

        dst = Path(path[0])
        for i in range(1, len(path)):
            print("Merging", path[i], "->", dst)
            rname_move_dirmove_ident = [0, 0, 0, 0]
            merge(dst, Path(path[i]))
            print(rname_move_dirmove_ident[0], "files renamed")
            print(rname_move_dirmove_ident[1], "files as is")
            print(rname_move_dirmove_ident[2], "directories as is")
            print(rname_move_dirmove_ident[3], "files coincide")
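

# A minimal usage sketch, kept as comments. The graph name, declarations and budget below are
# hypothetical placeholders, not values shipped with this module:
#
#   crawler_decls = [...]   # declarations accepted by Crawler.from_declaration
#   metric_decls = [...]    # declarations accepted by Metric.from_declaration
#   rm = ResultsMerger(graph_full_names=[('konect', 'example-graph')],
#                      crawler_decls=crawler_decls, metric_decls=metric_decls, budget=1000)
#   rm.draw_by_crawler()
#   rm.draw_aggregated(aggregator='AUCC')
#   rm.show_plots()
#
# Results collected on different machines could be combined beforehand, e.g.:
#   ResultsMerger.merge_folders('results', 'results_from_server', check_identical=True)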