# Copyright (C) 2018 Jiaan Dai

"""Script for generating .mgf from .jsonl.gz file.

The input .jsonl.gz file (or .dat file) should be converted from .msp or
.sptxt spectral library using extract_library.py, which will only extract
necessary fields for the future analysis. We always separate the raw data
loading from actual processing.
"""

import argparse
import gzip
import json
import re
from pyteomics import mgf  # inapplicable for remote usage

import sys
sys.path.append('.')
from PowerSimTool.mass import PROTON
from PowerSimTool.loaders import load_peptides


def gen_seq_pool(dbpath, peptide_lower_mass=700, peptide_upper_mass=5000):
    """Build the set of peptide sequences whose mass lies in a given window.

    Peptides are obtained from ``load_peptides(dbpath)``; only those with
    ``peptide_lower_mass <= p.mass <= peptide_upper_mass`` (both bounds
    inclusive) contribute their ``sequence`` to the result.

    Args:
        dbpath: Path handed unchanged to ``load_peptides``.
        peptide_lower_mass: Inclusive lower bound on ``p.mass``.
        peptide_upper_mass: Inclusive upper bound on ``p.mass``.

    Returns:
        A ``set`` of the ``sequence`` attributes of the retained peptides.
    """
    # A set comprehension avoids the intermediate filtered list and
    # de-duplicates sequences in a single pass.
    return {
        p.sequence
        for p in load_peptides(dbpath)
        if peptide_lower_mass <= p.mass <= peptide_upper_mass
    }


# Accepts exactly "b<digits>" or "y<digits>"; any other annotation text is
# counted as unannotated. Compiled once instead of once per library record.
_FRAGMENT_ION = re.compile(r'^[by](\d+)$')


def load_spectra(library_path, seq_pool, params_out):
    """Yield MGF spectrum dicts for library records found in ``seq_pool``.

    The library is read as gzip-compressed JSON Lines (one JSON object per
    line, UTF-8). For each record, every 'I' in ``record['sequence']`` is
    replaced by 'L' and the result is looked up in ``seq_pool``; records
    that are not found are skipped before any further work is done.
    Accepted records are numbered consecutively starting at 1.

    This is a generator: entries are appended to ``params_out`` only as
    spectra are consumed, one entry immediately before each yielded spectrum.

    Args:
        library_path: Path to the ``.jsonl.gz`` library file.
        seq_pool: Container of accepted sequences (membership-tested).
        params_out: List that receives one compact JSON string per yielded
            spectrum with the keys ``scan``, ``seq``, ``nmod``, ``r_s`` and
            ``r_n``.

    Yields:
        A dict with ``'m/z array'``, ``'intensity array'`` and ``'params'``
        (``CHARGE``, ``PEPMASS``, ``TITLE``, ``SCANS``).

    Raises:
        ValueError: If a record's ``num_peaks`` differs from the length of
            its ``peaks`` list. (An explicit raise rather than ``assert``
            so the check survives ``python -O``.)
    """
    scan_num = 1
    # Text mode decodes once at the I/O boundary instead of per line.
    with gzip.open(library_path, 'rt', encoding='utf-8') as library:
        for line in library:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in the file.
                continue
            record = json.loads(line)

            # label -- filter first so skipped records cost nothing more.
            # NOTE(review): 'I' -> 'L' presumably because the two residues
            # share a mass; confirm seq_pool follows the same convention.
            label = record['sequence'].replace('I', 'L')
            if label not in seq_pool:
                continue

            peaks = record['peaks']
            num_peaks = record['num_peaks']
            if num_peaks != len(peaks):
                raise ValueError(
                    'num_peaks is {} but record for {!r} has {} peaks'.format(
                        num_peaks, label, len(peaks)))

            # Only the part of the annotation before the first '/' is
            # tested against the b/y pattern.
            annotated = sum(
                1 for p in peaks
                if _FRAGMENT_ION.match(p['annotation'].split('/')[0]))
            unannotated = num_peaks - annotated
            # Both ratios are normalised by twice the sequence length.
            norm = 2 * len(label)
            signal_ratio = annotated / norm
            noise_ratio = unannotated / norm

            # basic
            charge = record['charge']
            # precursor_mass is presumably the neutral mass (TODO confirm):
            # add one PROTON per charge, then divide by the charge.
            precursor_mz = (record['precursor_mass'] + charge * PROTON) / charge

            # build
            title = {
                'scan': scan_num,
                'seq': label,
                'nmod': record['num_mods'],
                'r_s': '{:.4f}'.format(signal_ratio),
                'r_n': '{:.4f}'.format(noise_ratio),
            }
            spec = {
                'm/z array': [p['mass'] for p in peaks],
                'intensity array': [p['intensity'] for p in peaks],
                'params': {
                    'CHARGE': str(charge) + '+',
                    'PEPMASS': precursor_mz,
                    'TITLE': 'converted.{0}.{0}.{1}'.format(scan_num, charge),
                    'SCANS': scan_num,
                },
            }
            params_out.append(json.dumps(title, separators=(',', ':')))
            yield spec
            scan_num += 1


def main(library_path, dbpath, output_path, param_output):
    """Convert a library to MGF and dump the per-spectrum params alongside.

    Args:
        library_path: Input ``.jsonl.gz`` library, passed to ``load_spectra``.
        dbpath: Sequence database path, passed to ``gen_seq_pool``.
        output_path: Destination passed to ``mgf.write`` as ``output``.
        param_output: Text file that receives one params entry per line.
    """
    allowed_sequences = gen_seq_pool(dbpath)
    title_lines = []
    spectra = load_spectra(library_path, allowed_sequences, title_lines)
    # load_spectra fills title_lines lazily while its generator is being
    # consumed, so the params file must be written after mgf.write.
    mgf.write(spectra=spectra, output=output_path)
    with open(param_output, 'w') as sink:
        sink.writelines(entry + '\n' for entry in title_lines)


if __name__ == '__main__':
    # Command-line entry point: all four options are mandatory and are
    # forwarded positionally to main().
    cli = argparse.ArgumentParser(description=__doc__)
    required_options = (
        ('--library', 'input .jsonl.gz library file'),
        ('--database', 'input .fasta sequence database for existence check'),
        ('-o', 'output .mgf file'),
        ('-p', 'output .jsonl file of spectrum params'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()
    main(options.library, options.database, options.o, options.p)
