# TRS Analysis Script for OSLC Connect Applications
# Input is Downloaded Zip from TRS Admin Page
# Output is a Usage Graph of Changes in last 7 days and a small number of metrics
# Optional Output is saved dataframe

import os
import sys
import zipfile

import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import re
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

def processchangeeventfile( soup ) :
    # navigate the soup to find all of the changes.  General format is as follows ->
    #   <trs:ChangeLog>
    #     <trs:change>
    #       <trs:Modification rdf:about="https://phoenix-jira.sodiuswillert.cloud/rest/oslc/1.0/cm/issue/AMRPRT-31#1744982241226_210169">
    #         <trs:order rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">210169</trs:order>
    #         <trs:changed rdf:resource="https://phoenix-jira.sodiuswillert.cloud/rest/oslc/1.0/cm/issue/AMRPRT-31"/>
    #       </trs:Modification>
    #     </trs:change>
    #     <trs:change>
    #    <trsLChangeLog>

    changes = soup.find_all('change')
    for changeNode in changes:
        for child in changeNode.children:  # Whitespace nodes to avoid
            if child.name:
                node = child
                changeType = child.name  # Type of change
                uri  = child.changed.attrs['rdf:resource']  # artifact
                order = child.order.text
                resource = child.attrs['rdf:about']
                changeUriPattern = r"(\S+)(\/rest\/oslc\/1.0\/)(\S+)(#)(\S+)(_)(\S+)"
                match = re.match(changeUriPattern, resource)
                if match:
                    artifact = match.group(3)
                    epoch = match.group(5)
                    changetime = datetime.fromtimestamp(int(epoch) / 1000)  # datetime of the change
                    # create the change event
                    event = [changeType, uri, changetime, artifact, order]
                    changeevents.append(event)
def processbaseeventfile(soup):
    """Extract base-member resources from one parsed TRS base XML document.

    Expected structure::

        <ldp:Container rdf:about=".../rest/oslc/1.0/cm/trs/base">
          <rdfs:member rdf:resource=".../rest/oslc/1.0/cm/issue/TEST-43727"/>
          <rdfs:member rdf:resource=".../rest/oslc/1.0/cm/issue/TEST-40638"/>
        </ldp:Container>

    Appends one ``['Base', artifact, uri]`` row per member to the
    module-level ``baseevents`` list; members whose URI does not match the
    expected OSLC pattern are skipped.
    """
    uriPattern = re.compile(r"(\S+)(\/rest\/oslc\/1\.0\/)(\S+)")
    members = soup.find_all('member')
    for member in members:
        uri = member.attrs['rdf:resource']
        match = uriPattern.match(uri)
        if match:
            artifact = match.group(3)  # path after the OSLC root, e.g. cm/issue/TEST-43727
            # TODO: more type information could be parsed from the path here
            baseevents.append(['Base', artifact, uri])

# Get the commandline parameters:
#   argv[1]  required  - TRS admin-page zip archive to analyse
#   argv[2]  optional  - pickle file to save the change dataframe to
archive = ''
dataframefile = ''

if len(sys.argv) >= 2:
    archive = sys.argv[1]
    print('Source archive :' + archive)
    if len(sys.argv) >= 3:
        dataframefile = sys.argv[2]
        print('Target dataframe file :' + dataframefile)
else:
    print('Use arguments archive.zip (optional: dataframe.pkl)')
    # sys.exit (not the interactive-only builtin exit) is the proper way
    # to terminate a script with a message / non-zero status.
    sys.exit("missing input parameters")

# create some storage lists filled in by the two process* functions
baseevents = []     # rows: ['Base', artifact, uri] from base/*.xml
changeevents = []   # rows: [change_type, uri, changetime, artifact, order] from changelog/*.xml

print('Parsing files in archive for events')
with zipfile.ZipFile(archive) as zipf:
    for filename in tqdm(zipf.namelist()):
        # Directory entries inside a zip end with '/'. The previous
        # os.path.isdir(filename) check consulted the *local filesystem*,
        # not the archive, so it only worked by accident.
        if not filename.endswith('/'):
            # only the XML payload files carry TRS data
            if filename.endswith('.xml'):
                with zipf.open(filename) as file:
                    soup = BeautifulSoup(file.read(), features='xml')
                    if filename.startswith('base/'):
                        processbaseeventfile(soup)
                    elif filename.startswith('changelog/'):
                        processchangeeventfile(soup)


print('Total Changes :' + str(len(changeevents)))
print('Total Base Members :' + str(len(baseevents)))

# Create a dataframe for changes for some analysis and plots
columns = ['change_type', 'uri', 'changetime', 'artifact', 'order']
changeeventdf = pd.DataFrame(columns=columns, data=changeevents)
# 'order' arrives as XML text; sorting strings is lexicographic ("10" < "9"),
# so convert to numbers before ordering the change log.
changeeventdf['order'] = pd.to_numeric(changeeventdf['order'])
changeeventdf = changeeventdf.sort_values(by=['order'], ascending=True)
# We want a simplified date-hour bucket for looking at change rates hourly across multiple days
changeeventdf['eventdayhour'] = changeeventdf['changetime'].apply(lambda x: datetime.strftime(x, '%Y-%m-%d-%H'))
# Print a change summary
print('Summary of changes hourly in the change log')
print(changeeventdf['eventdayhour'].value_counts().sort_index())
print('The 20 most often changed items in the change log')
print(changeeventdf['artifact'].value_counts().head(20))
print('Summary of the types of changes')
print(changeeventdf['change_type'].value_counts())

# Create the TRS Event Plot
# Label the figure with the date of the newest change in the order-sorted log
analysisDate = changeeventdf.iloc[-1]['changetime'].strftime('%Y-%m-%d')

activity = plt.figure()
activity.set_figheight(15)
activity.set_figwidth(15)
activity.suptitle(f'TRS Change Data ending on {analysisDate}')

# one dataframe slice per TRS event type
creationevents = changeeventdf[changeeventdf['change_type'] == 'Creation']
deletionevents = changeeventdf[changeeventdf['change_type'] == 'Deletion']
modifiedevents = changeeventdf[changeeventdf['change_type'] == 'Modification']

# Build every hourly slot between the first and last change, formatted like
# 'eventdayhour', so hours with zero events still appear on the x-axis.
firstChange = changeeventdf['changetime'].min()
lastChange = changeeventdf['changetime'].max()
graphSeries = pd.Series(pd.date_range(start=firstChange, end=lastChange, freq='h'))
graphSeries = graphSeries.apply(lambda x: datetime.strftime(x, '%Y-%m-%d-%H'))

# provide a fill option to fill missing items in the series
def fillSeries(series, slots=None):
    """Insert an explicit 0 count for every label in ``slots`` that is
    missing from ``series``' index, mutating ``series`` in place.

    ``slots`` defaults to the module-level ``graphSeries`` (the full hourly
    range), preserving the original call sites.
    """
    if slots is None:
        slots = graphSeries
    for value in slots:
        if value not in series:  # membership tests the Series index
            series.loc[value] = 0

generalActivity = activity.add_subplot(3, 1, 1)
labels = graphSeries

def preparedCounts(dayhours):
    # hourly counts, zero-filled across the full range, then index-sorted
    counts = dayhours.value_counts()
    fillSeries(counts)
    return counts.sort_index()

trsEvents = preparedCounts(changeeventdf['eventdayhour'])
generalActivity.plot(trsEvents, color='blue', linestyle='solid', marker='o', label='total trs events')

creationEvents = preparedCounts(creationevents['eventdayhour'])
generalActivity.plot(creationEvents, color='green', linestyle='solid', marker='o', label="creations")

deletionEvents = preparedCounts(deletionevents['eventdayhour'])
generalActivity.plot(deletionEvents, color='red', linestyle='solid', marker='o', label="deletions")

modificationEvents = preparedCounts(modifiedevents['eventdayhour'])
generalActivity.plot(modificationEvents, color='orange', linestyle='solid', marker='o', label="modifications")

# Call out the peak hour for each event type
generalActivity.annotate("creation high " + str(creationEvents.max()), xy=(creationEvents.idxmax(), creationEvents.max()), arrowprops=dict(arrowstyle='->', color='green'))
generalActivity.annotate("deletion high " + str(deletionEvents.max()), xy=(deletionEvents.idxmax(), deletionEvents.max()), arrowprops=dict(arrowstyle='->', color='red'))
generalActivity.annotate("modification high " + str(modificationEvents.max()), xy=(modificationEvents.idxmax(), modificationEvents.max()), arrowprops=dict(arrowstyle='->', color='orange'))

# Only show ~8 evenly spaced day-hour tick labels instead of every hour
increments = np.linspace(0, len(trsEvents) - 2, 8, dtype=int, endpoint=True)
generalActivity.set_xticks(increments)
generalActivity.set_xticklabels(labels.iloc[increments], rotation=90)
generalActivity.set_xlabel("Day-Hour")
generalActivity.set_ylabel("Number of TRS Change Events")
generalActivity.set_title('TRS Changes by Hour')
generalActivity.legend()

# Save the figure, named by the analysis end date
outputname = "trsDiagrams_" + analysisDate + ".png"
activity.savefig(outputname, bbox_inches='tight', pad_inches=0.5)

# Optionally persist the change dataframe for later reuse
if dataframefile:
    changeeventdf.to_pickle(dataframefile)