import utils.utils
import pandas
from datetime import datetime
from fuzzywuzzy import fuzz
from multiprocessing.queues import Queue
from deprecated.sphinx import deprecated
@deprecated(version='1.2.0', reason="Replaced by decision points")
def selectMostFrequentCase(dataframe: pandas.DataFrame, status_queue: Queue, flattened=False, threshold=90):
    """
    Select the most frequent routine (trace) in the process log.

    Traces are compared as comma-joined event strings; Levenshtein-based fuzzy
    matching groups similar traces and, among the most frequent group, the
    trace with the shortest duration is selected.

    :param dataframe: low-level pandas dataframe of the process
    :param status_queue: queue used to print progress messages in the GUI
    :param flattened: if True, compare traces on the flattened
        (concept:name, category, browser hostname) key instead of concept:name
    :param threshold: similarity threshold in percent; traces are considered
        similar if their fuzzy ratio is at least this value (default 90)
    :return: dataframe rows of the selected trace, or None if input is empty
    """
    df = dataframe
    if df.empty:
        return None

    # Flattening: derive a hostname column and a compact per-event key.
    # NOTE: these columns are added to the caller's dataframe in place
    # (pre-existing behavior, kept for backward compatibility).
    df['browser_url_hostname'] = df['browser_url'].apply(
        lambda url: utils.utils.getHostname(url)).fillna('')
    df['flattened'] = df[
        ['concept:name', 'category', 'browser_url_hostname']].agg(','.join, axis=1)
    groupby_column = 'flattened' if flattened else 'concept:name'

    # Merge the rows of each trace into a single row, so the resulting
    # dataframe has n rows where n is the number of traces. For example:
    # case:concept:name  concept:name                           timestamp
    # 0                  Create Fine, Send Fine                 2020-03-20 17:09:06:308, ...
    # 1                  Insert Fine Notification, Add penalty  2020-03-20 17:10:28:348, ...
    df1 = df.groupby(['case:concept:name'])[
        [groupby_column, 'time:timestamp']].agg(', '.join).reset_index()

    def getDuration(time: str) -> float:
        """
        Duration of a trace in seconds: difference between the first and the
        last timestamp in the trace.

        :param time: comma-joined timestamp column of one trace
        :return: duration in seconds
        """
        timestamps = time.split(',')
        try:
            start = datetime.fromisoformat(timestamps[0].strip())
            finish = datetime.fromisoformat(timestamps[-1].strip())
        except ValueError:
            # legacy format with colon-separated milliseconds
            start = datetime.strptime(timestamps[0].strip(), "%Y-%m-%d %H:%M:%S:%f")
            finish = datetime.strptime(timestamps[-1].strip(), "%Y-%m-%d %H:%M:%S:%f")
        return (finish - start).total_seconds()

    df1['duration'] = df1['time:timestamp'].apply(getDuration)

    # Calculate variants by grouping equal traces. For example:
    # concept:name                                        variants  duration
    # typed, clickTextField, changeField, mouseClick...   [0, 1]    [25.123, 26.342]
    # typed, changeField, mouseClick, formSubmit, li...   [2]       [22.324]
    df2 = df1.groupby([groupby_column], sort=False)[
        ['case:concept:name', 'duration']].agg(list).reset_index().rename(
        columns={"case:concept:name": "variants"})

    def _findVariantWithShortestDuration(df1: pandas.DataFrame, most_frequent_variants, equal=False):
        """
        Find the trace with the minimum duration among the most frequent ones.
        Not used when all traces are different.

        :param df1: per-trace dataframe of the process
        :param most_frequent_variants: case ids (equal=True) or row indices
            (equal=False) of the most frequent traces
        :param equal: True when most_frequent_variants holds case ids of
            exactly-equal traces rather than positional row indices
        :return: (case:concept:name of the shortest variant, its duration)
        """
        if equal:
            # most_frequent_variants is a list of case ids: filter by value
            candidates = df1.loc[df1['case:concept:name'].isin(most_frequent_variants)]
        else:
            # most_frequent_variants is a list of row indices: filter by position
            candidates = df1.iloc[most_frequent_variants, :]
        # row with the smallest duration wins; min computed once
        min_duration = min(candidates['duration'].tolist())
        min_duration_trace = candidates.loc[
            candidates['duration'] == min_duration]['case:concept:name'].tolist()[0]
        return min_duration_trace, min_duration

    def _findMostFrequentTraces(df2: pandas.DataFrame, most_frequent_variants):
        """
        Resolve the row indices of the most frequent variants into case ids.

        :param df2: variants dataframe (one row per distinct trace)
        :param most_frequent_variants: row indices of the most frequent variants
        :return: list of case:concept:name of the most frequent traces
        """
        try:
            # second column of df2 is 'variants'; each cell is a list of case ids
            most_frequent_traces = df2.iloc[most_frequent_variants, 1].values.tolist()
            max_most_frequent_traces = max(most_frequent_traces, key=len)
            if len(max_most_frequent_traces) == 1:
                # every sublist has one case id: similar-but-distinct traces
                return [trace[0] for trace in most_frequent_traces]
            # otherwise some sublist has several elements: equal traces
            return max_most_frequent_traces
        except Exception:
            # best-effort fallback: return the raw indices unchanged
            return most_frequent_variants

    # variants as a list of index lists, one per distinct trace, e.g. [[0, 1], [2]]
    variants = df2['variants'].tolist()
    # the longest variant is the most frequent one, e.g. [0, 1]
    most_frequent_variants = max(variants, key=len)

    if len(most_frequent_variants) == 1:
        # All variants are different: check string similarity between all
        # traces; similarity is irrelevant in the other branch because there
        # the strings are exactly equal.
        def func(name):
            matches = df2.apply(
                lambda row: (fuzz.ratio(row[groupby_column], name) >= threshold), axis=1)
            return [i for i, x in enumerate(matches) if x]

        # for each row, indices of the rows whose trace string is similar enough
        df3 = df2.apply(lambda row: func(row[groupby_column]), axis=1)
        most_frequent_variants = max(df3.tolist(), key=len)

        if len(most_frequent_variants) == 1:
            # no similar strings at all: pick the globally shortest trace,
            # no filtering needed
            durations = df1['duration'].tolist()
            min_duration = min(durations)
            min_duration_trace = df1.loc[
                df1['duration'] == min_duration]['case:concept:name'].tolist()[0]
            if len(variants) == 1:
                status_queue.put(
                    f"[PROCESS MINING] There is only 1 trace with duration: {min_duration} sec")
            else:
                status_queue.put(
                    f"[PROCESS MINING] All {len(variants)} variants are different, "
                    f"case {min_duration_trace} is the shortest ({min_duration} sec)")
        else:
            # some strings are similar: pick the shortest among the similar ones
            min_duration_trace, duration = _findVariantWithShortestDuration(
                df1, most_frequent_variants)
            most_frequent_traces = _findMostFrequentTraces(df2, most_frequent_variants)
            status_queue.put(
                f"[PROCESS MINING] There are {len(variants)} variants, "
                f"among the {len(most_frequent_traces)} similar traces, "
                f"case {min_duration_trace} is the shortest ({duration} sec)")
            print(f"[PROCESS MINING] Traces {most_frequent_traces} are similar by at least {threshold}%")
    else:
        # at least two traces are exactly equal: pick the shortest among them
        min_duration_trace, duration = _findVariantWithShortestDuration(
            df1, most_frequent_variants, equal=True)
        status_queue.put(
            f"[PROCESS MINING] There are {len(df1)} traces and {len(variants)} variants, "
            f"among the {len(most_frequent_variants)} equal traces, "
            f"case {min_duration_trace} is the shortest ({duration} sec)")
        print(f"[PROCESS MINING] Traces {most_frequent_variants} are equal")

    case = df.loc[df['case:concept:name'] == min_duration_trace]
    return case