# Source code for modules.mostFrequentRoutine

import utils.utils
import pandas
from datetime import datetime
from fuzzywuzzy import fuzz
from multiprocessing.queues import Queue
from deprecated.sphinx import deprecated


@deprecated(version='1.2.0', reason="Replaced by decision points")
def selectMostFrequentCase(dataframe: pandas.DataFrame, status_queue: Queue, flattened=False, threshold=90):
    """
    Select the most frequent routine (trace) of the process.

    Traces are compared as strings: identical strings form one variant. When every variant occurs only
    once, ``fuzz.ratio`` is used to group variants whose similarity score is at least ``threshold``.
    Among the selected group, the trace with the shortest duration is returned; when no two traces are
    equal or similar, the shortest trace of the whole log is returned.

    The input dataframe is not modified: the helper columns ``browser_url_hostname`` and ``flattened``
    are added to an internal copy (and are therefore present in the returned rows).

    :param dataframe: low level pandas dataframe of process; the columns ``case:concept:name``,
        ``concept:name``, ``category``, ``browser_url`` and ``time:timestamp`` are read
    :param status_queue: queue to print messages in GUI
    :param flattened: if True, traces are compared on the ``flattened`` column
        (``concept:name,category,browser_url_hostname``) instead of ``concept:name`` only
    :param threshold: threshold of similarity, traces are considered similar if their ``fuzz.ratio``
        score is at least this value (90 by default)
    :return: rows of the selected trace, or None if the dataframe is empty
    """
    if dataframe.empty:
        return None

    # Work on a copy so that the helper columns created below do not leak into the caller's dataframe
    df = dataframe.copy()

    # flattening
    df['browser_url_hostname'] = df['browser_url'].apply(utils.utils.getHostname).fillna('')
    df['flattened'] = df[['concept:name', 'category', 'browser_url_hostname']].agg(','.join, axis=1)
    groupby_column = 'flattened' if flattened else 'concept:name'

    # Merge rows of each trace into one row, so the resulting dataframe has n rows where n is the number of traces
    # For example I get
    #    case:concept:name  concept:name                            timestamp
    #    0                  Create Fine, Send Fine                  2020-03-20 17:09:06:308, 2020-03-20 17:09:06:3
    #    1                  Insert Fine Notification, Add penalty   2020-03-20 17:10:28:348, 2020-03-20 17:10:28:2
    df1 = df.groupby(['case:concept:name'])[[groupby_column, 'time:timestamp']].agg(', '.join).reset_index()

    def getDuration(time):
        """
        Get duration of a trace, taking the first and last timestamp in the trace and calculating the difference

        :param time: comma separated timestamps of the trace (one cell of the merged timestamp column)
        :return: time duration in seconds
        :raises ValueError: if the timestamps match neither the ISO format nor ``%Y-%m-%d %H:%M:%S:%f``
        """
        timestamps = time.split(',')
        first = timestamps[0].strip()
        last = timestamps[-1].strip()
        try:
            start = datetime.fromisoformat(first)
            finish = datetime.fromisoformat(last)
        except ValueError:
            # not ISO formatted: fall back to the format that uses ':' before the fractional part
            start = datetime.strptime(first, "%Y-%m-%d %H:%M:%S:%f")
            finish = datetime.strptime(last, "%Y-%m-%d %H:%M:%S:%f")
        return (finish - start).total_seconds()

    df1['duration'] = df1['time:timestamp'].apply(getDuration)

    # calculate variants, grouping the previous dataframe if there are equal rows
    #    concept:name                                        variants   duration
    #    typed, clickTextField, changeField, mouseClick...   [0, 1]     [25.123, 26.342]
    #    typed, changeField, mouseClick, formSubmit, li...   [2]        [22.324]
    df2 = df1.groupby([groupby_column], sort=False)[['case:concept:name', 'duration']].agg(
        list).reset_index().rename(columns={"case:concept:name": "variants"})

    def _findVariantWithShortestDuration(df1: pandas.DataFrame, most_frequent_variants, equal=False):
        """
        Find the trace with the minimum duration in seconds. Not used when all traces are different

        :param df1: dataframe with one row per trace
        :param most_frequent_variants: case ids of most frequent traces if ``equal`` is True,
            otherwise positional row indices into ``df1``
        :param equal: how ``most_frequent_variants`` must be interpreted (see above)
        :return: tuple (case:concept:name of the trace with shortest duration, that duration)
        """
        if equal:
            # there are at least 2 equal traces, most_frequent_variants is a list of case ids like [0, 1]
            candidates = df1.loc[df1['case:concept:name'].isin(most_frequent_variants)]
        else:
            candidates = df1.iloc[most_frequent_variants, :]
        shortest = min(candidates['duration'].tolist())
        # first candidate row having the smallest duration
        case_id = candidates.loc[candidates['duration'] == shortest, 'case:concept:name'].tolist()[0]
        return case_id, shortest

    def _findMostFrequentTraces(df2: pandas.DataFrame, most_frequent_variants):
        """
        Find the case ids of the most frequent traces

        :param df2: dataframe with one row per variant, whose second column holds the lists of case ids
        :param most_frequent_variants: positional row indices into ``df2``
        :return: case:concept:name of most frequent traces; ``most_frequent_variants`` itself if the lookup fails
        """
        try:
            # lists of case ids (second column) of the selected rows
            most_frequent_traces = df2.iloc[most_frequent_variants, 1].values.tolist()
            # find the longest sublist of case:concept:name
            max_most_frequent_traces = max(most_frequent_traces, key=len)
            if len(max_most_frequent_traces) == 1:
                # every sublist has exactly 1 element (similar but not equal traces): flatten
                return [trace[0] for trace in most_frequent_traces]
            # there is a sublist with more elements, i.e. there are equal traces
            return max_most_frequent_traces
        except Exception:
            # deliberate best-effort: the result is only used for status messages
            return most_frequent_variants

    # get variants as list, each item represents the case ids of one variant
    # [[0, 1], [2]]
    variants = df2['variants'].tolist()

    # longest variant is selected because it's the most frequent
    # [0, 1]
    most_frequent_variants = max(variants, key=len)

    if len(most_frequent_variants) == 1:
        # all variants are different, I need to check similarities or find the one with the
        # shortest duration in the whole dataset

        # For each variant collect the row indices of all the variants similar to it, then keep the
        # largest group. I don't need to check similarities in the other case, because there the strings
        # are exactly the same
        names = df2[groupby_column].tolist()
        similar_groups = [
            [index for index, other in enumerate(names) if fuzz.ratio(other, name) >= threshold]
            for name in names
        ]
        most_frequent_variants = max(similar_groups, key=len)

        if len(most_frequent_variants) == 1:
            # there are no similar strings, all are different, so I find the one with the smallest duration
            # in the whole dataset, I don't need to filter like in the other cases
            shortest = min(df1['duration'].tolist())
            min_duration_trace = df1.loc[df1['duration'] == shortest, 'case:concept:name'].tolist()[0]
            if len(variants) == 1:
                status_queue.put(
                    f"[PROCESS MINING] There is only 1 trace with duration: {shortest} sec")
            else:
                status_queue.put(
                    f"[PROCESS MINING] All {len(variants)} variants are different, "
                    f"case {min_duration_trace} is the shortest ({shortest} sec)")
        else:
            # some strings are similar; since every variant has a single trace, rows of df1 and df2
            # are in the same order and the indices can be used on both
            min_duration_trace, duration = _findVariantWithShortestDuration(df1, most_frequent_variants)
            most_frequent_traces = _findMostFrequentTraces(df2, most_frequent_variants)
            status_queue.put(
                f"[PROCESS MINING] There are {len(variants)} variants, "
                f"among the {len(most_frequent_traces)} similar traces, "
                f"case {min_duration_trace} is the shortest ({duration} sec)")
            print(f"[PROCESS MINING] Traces {most_frequent_traces} are similar by at least {threshold}%")
    else:
        # there are equal traces, most_frequent_variants already contains their case ids
        min_duration_trace, duration = _findVariantWithShortestDuration(df1, most_frequent_variants, equal=True)
        status_queue.put(
            f"[PROCESS MINING] There are {len(df1)} traces and {len(variants)} variants, "
            f"among the {len(most_frequent_variants)} equal traces, "
            f"case {min_duration_trace} is the shortest ({duration} sec)")
        print(f"[PROCESS MINING] Traces {most_frequent_variants} are equal")

    # all the rows of the selected trace
    return df.loc[df['case:concept:name'] == min_duration_trace]