#!/usr/bin/python

import argparse
import json
import os
import re

# Version string reported by the -V flag.
VERSION = "1.0.1"

# Command-line interface: two required positional arguments plus optional flags.
parser = argparse.ArgumentParser(description='Create a BAR input JSON file.')
# NOTE: these positional names contain a dash, so they are not reachable as
# plain attributes (args.pipeline-file is not valid Python); the rest of the
# script reads them with getattr().
parser.add_argument('pipeline-file', help='which pipeline JSON file to read static values from')
parser.add_argument('fastq-folder', help='which FastQ folder to process')
# Defaults to the current working directory at the time the script starts.
parser.add_argument('--save-folder', help='where to save the output JSON file', default=os.getcwd())
parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
parser.add_argument('-V', help='print version and exit', action='version', version='%(prog)s ' + VERSION)
# Parsing happens at module level, so the arguments are read as soon as the
# file is executed.
args = parser.parse_args()


def log(msg):
    """Print ``msg``, but only when the --verbose flag was given."""
    if not args.verbose:
        return
    print(msg)


def remove_trailing_slash(folder):
    """Return ``folder`` without any trailing '/' characters.

    Every trailing slash is removed, not just one, so that a path such as
    "runs/bar1//" still ends in its directory name afterwards. A path made
    only of slashes collapses to "/", and an empty string is returned as is.

    :param folder: path string, possibly ending in one or more '/'
    :return: the same path with the trailing slashes stripped
    """
    stripped = folder.rstrip('/')
    if stripped:
        return stripped
    # Either empty input or nothing but slashes: keep at most a single "/".
    return folder[:1]


# The positional argument names contain a dash, so they cannot be read as
# plain attributes. getattr() is a hack: https://stackoverflow.com/a/20250435
pipeline_file = getattr(args, 'pipeline-file')
fastq_folder = remove_trailing_slash(getattr(args, 'fastq-folder'))
save_folder = remove_trailing_slash(args.save_folder)

# The pipeline file provides the static values copied into the output.
with open(pipeline_file, 'r') as f:
    pipeline = json.load(f)

print('Will process FastQ folder: {}'.format(fastq_folder))

# Each folder is a BAR; the directory name is the BAR name, or "user_ref".
bar_name = os.path.basename(fastq_folder)
bar = {
    'v': 1,
    'user_ref': bar_name,
    'sequencer': pipeline['sequencer'],
    'pairend': pipeline['pairend'],
    # Filled in later, one entry per patient/mid pair.
    'analyses': []
}


# This is how we extract data from file names.
# We split the name by '_' and walk the tokens in reverse, skipping the first
# one visited (the last piece of the file name).
# The "lane" is optional; might not be present in the returned dictionary.
def extract_from(file_name):
    """Parse a FastQ file name into its patient / mid / lane / read parts.

    The name is split on '_'. The last token is ignored, and the tokens before
    it are read from right to left: "R<digit>" is the read, "L<digits>" the
    lane and "S<digits>" the mid. The first token that matches none of these
    ends the walk; it and everything to its left form the patient name.

    :param file_name: bare file name, e.g. "<patient>_S1_L001_R1_001.fastq.gz"
    :return: dict that always has 'patient'; 'r' (lower-cased), 'mid' and
        'lane' are present only when a matching token was found
    :raises SystemExit: with status 1, after printing a message, when the name
        does not have 4 or 5 tokens or contains no patient token
    """
    log('Processing "' + file_name + '"...')
    tokens = file_name.split('_')

    n = len(tokens)
    if n < 4 or n > 5:
        print("File " + file_name + " has " + str(n) + " token(s) in it. Expected 4 or 5.")
        # Exit with a failure status so callers can tell the run did not succeed.
        raise SystemExit(1)

    log('   Ignoring last token: ' + tokens[-1])

    data = {}
    # Number of leading tokens that belong to the patient name.
    patient_end = None
    for index in range(n - 2, -1, -1):
        token = tokens[index]
        if re.search(r'^R[0-9]$', token):
            data['r'] = token.lower()  # e.g. "r1"/"r2"
            log('   Parsed "r" value: ' + data['r'])
        elif re.search(r'^L[0-9]+$', token):
            data['lane'] = token  # e.g. "L001"
            log('   Parsed "lane" value: ' + data['lane'])
        elif re.search(r'^S[0-9]+$', token):
            data['mid'] = token  # e.g. "S1"
            log('   Parsed "mid" value: ' + data['mid'])
        else:
            patient_end = index + 1
            log('   Parsed "patient"\'s last piece of token: ' + token)
            break

    if patient_end is None:
        # Every token looked like a read/lane/mid marker, so nothing is left
        # over to serve as the patient name.
        print("File " + file_name + " has no patient name in it.")
        raise SystemExit(1)

    # e.g. "GN2804-16A8271-2-subset2"
    # Rebuild the patient from the token positions rather than searching the
    # file name for the token text: a text search stops at the first
    # occurrence, which may sit inside an earlier part of the patient name.
    data['patient'] = '_'.join(tokens[:patient_end])
    log('   Parsed full "patient" value: ' + data['patient'])
    return data


# Keep only regular, non-hidden files whose name ends with ".fastq.gz".
# Source: https://stackoverflow.com/a/3207973
filenames_in_folder = []
for entry in os.listdir(fastq_folder):
    if entry.startswith('.') or not entry.endswith('.fastq.gz'):
        continue
    if os.path.isfile(os.path.join(fastq_folder, entry)):
        filenames_in_folder.append(entry)

# Group the absolute file paths as patient -> mid -> lane -> read.
patient_files = {}
for fastq_name in filenames_in_folder:
    parsed = extract_from(fastq_name)

    patient_key = parsed['patient']
    mid_key = parsed['mid']
    # Files without a lane token are all filed under the 'N/A' lane.
    lane_key = parsed.get('lane', 'N/A')
    read_key = parsed['r']

    reads = (patient_files
             .setdefault(patient_key, {})
             .setdefault(mid_key, {})
             .setdefault(lane_key, {}))

    # The first file seen for a given read wins; later ones are ignored.
    if read_key not in reads:
        reads[read_key] = os.path.abspath(fastq_folder) + '/' + fastq_name

# Every patient/mid pair becomes one analysis entry of the BAR.
for patient, mids in patient_files.items():
    for mid, lanes in mids.items():
        bar['analyses'].append({
            'analysis_type': pipeline['analysis_type'],
            'user_ref': patient,
            'mid': mid,
            'experiment_type': pipeline['experiment_type'],
            'kit': pipeline['kit'],
            'pairend': pipeline['pairend'],
            # One {read: path} dictionary per lane.
            'files': list(lanes.values())
        })

# Nothing to save when no analysis could be built from the folder.
if len(bar['analyses']) == 0:
    print('No FastQ files were found. Maybe try another folder?')
    exit()

# The output file is named after the BAR and placed in the save folder.
output_name = '{}.json'.format(bar['user_ref'])
save_file = os.path.abspath(os.path.join(save_folder, output_name))

with open(save_file, 'w') as f:
    serialized = json.dumps(bar, indent=2)
    f.write(serialized)
    print('Saved a new BAR JSON: {}'.format(save_file))
