# Copyright (C) 2018 Jiaan Dai

"""Validation script using spectral library."""

########################
# data configurations
########################

# NOTE: the three paths below are absolute, machine-specific Windows paths;
# adjust them to the local setup before running this script.

# Spectral library: opened with gzip and parsed as one JSON record per line.
LIBRARY_PATH = r'D:\Libraries\Human_HCD_Library\human_hcd_extracted.dat'
# Target peptide database, read through load_peptides.
FASTADB_PATH = r'C:\Users\Jiaan\Desktop\PowerSimTool\data\uniprot-all.fasta'
# Decoy peptide database, read the same way as the target database.
# NOTE(review): the file name suggests a shuffled human-only database while
# the target file is named 'uniprot-all' -- confirm the two correspond.
DECOYDB_PATH = r'C:\Users\Jiaan\Desktop\PowerSimTool\data\uniprot-homo-sapiens.shuffle.fasta'

########################
# environment setup
########################

import gzip
import json
import re
import sys
sys.path.append('.')
from PowerSimTool.types import Spectrum
from PowerSimTool.loaders import load_peptides
from PowerSimTool.denovo import infer_tags
from PowerSimTool.search import search
from PowerSimTool.search import cosine_similarity as score
from PowerSimTool.mass import PROTON

########################
# database setup
########################

def _load_database(path, lower_mass=700, upper_mass=5000, tag_length=3):
    """Load a peptide database and precompute per-peptide search inputs.

    Peptides are read with ``load_peptides`` and kept only when their
    ``mass`` lies within ``[lower_mass, upper_mass]`` (both bounds inclusive).

    Args:
        path: Location of the database; passed straight to ``load_peptides``.
        lower_mass: Smallest peptide mass to keep (default 700).
        upper_mass: Largest peptide mass to keep (default 5000).
        tag_length: Argument given to ``get_theo_tags`` for every kept
            peptide (default 3).

    Returns:
        A 3-tuple ``(masses, peptides, tags)`` of parallel lists: position
        ``i`` in each list refers to the same peptide.
    """
    peptides = [p for p in load_peptides(path)
                if lower_mass <= p.mass <= upper_mass]
    # Built from the filtered list so all three lists share the same indices.
    peptide_mass = [p.mass for p in peptides]
    peptide_tags = [p.get_theo_tags(tag_length) for p in peptides]
    return (peptide_mass, peptides, peptide_tags)

# Load the target database first, then the decoy database.  Each result is
# the (masses, peptides, tags) tuple returned by _load_database.
targets, decoys = (_load_database(db_path)
                   for db_path in (FASTADB_PATH, DECOYDB_PATH))

########################
# spectra data loading
########################

# A peak counts as annotated only when the first '/'-separated field of its
# annotation is exactly a 'b' or 'y' followed by digits (e.g. "b3", "y12").
# Compiled once here rather than once per library record.
_ION_ANNOTATION_RE = re.compile(r'^[by](\d+)$')

# scan_num numbers the records in file order, starting at 0; after the loop
# it equals the number of records loaded.
scan_num = 0
spectra_with_labels = []
with gzip.open(LIBRARY_PATH, 'rb') as library:
    for line in library:
        if not line.strip():
            # Tolerate blank lines (such as a trailing newline at the end of
            # the file) instead of failing inside json.loads.
            continue
        record = json.loads(line.decode('utf-8'))

        # spec
        # Only the peak masses are taken from the record; the second tuple
        # element is fixed to 1 (presumably a uniform intensity -- confirm
        # against Spectrum).
        peaks = [(p['mass'], 1) for p in record['peaks']]
        charge = record['charge']
        # Turn the precursor mass into m/z: add charge * PROTON, then divide
        # by the charge.
        precursor_mz = (record['precursor_mass'] + charge * PROTON) / charge
        spec = Spectrum(scan_num, peaks, precursor_mz, charge)

        # labels
        # 'I' is replaced by 'L' in the labelled sequence.
        label = record['sequence'].replace('I', 'L')
        num_peaks = record['num_peaks']
        if num_peaks != len(record['peaks']):
            # Explicit raise instead of assert so the consistency check is
            # still performed when Python runs with -O.
            raise ValueError(
                'Record {}: num_peaks is {} but {} peaks are listed.'.format(
                    scan_num, num_peaks, len(record['peaks'])))
        unannotate = sum(
            1 for p in record['peaks']
            if not _ION_ANNOTATION_RE.match(p['annotation'].split('/')[0]))
        # Both ratios are normalised by twice the label length.
        denominator = 2 * len(label)
        signal_ratio = (num_peaks - unannotate) / denominator
        noise_ratio = unannotate / denominator
        num_mods = record['num_mods']

        spectra_with_labels.append({
            'spec': spec,
            'label': label,
            'signal_ratio': signal_ratio,
            'noise_ratio': noise_ratio,
            'num_mods': num_mods
        })
        scan_num += 1

########################
# validate existence
########################

# Index the target peptides by sequence.  When several peptides share a
# sequence, the one that comes last in targets[1] is the one kept.
whole_dbseq_pool = {cand.sequence: cand for cand in targets[1]}

# Keep only the spectra whose label occurs in the target database, each
# paired with the matching peptide.
# NOTE(review): labels had 'I' replaced by 'L' when they were loaded; this
# lookup only succeeds if the database sequences are normalised the same
# way -- confirm against load_peptides.
spectra_with_labels_and_peptides = []
for swl in spectra_with_labels:
    labelled_seq = swl['label']
    if labelled_seq in whole_dbseq_pool:
        spectra_with_labels_and_peptides.append(
            (swl, whole_dbseq_pool[labelled_seq]))

# Progress summary goes to stderr so stdout carries only the CSV results.
loaded_count = len(spectra_with_labels)
examined_count = len(spectra_with_labels_and_peptides)
print('Total {} spectra are loaded from spectral library.'.format(loaded_count),
      file=sys.stderr)
print('Total {} spectra are under examination after validating existences.'
      .format(examined_count), file=sys.stderr)

########################
# search in tags
########################

from datetime import datetime

# Each database is a tuple of parallel lists: (masses, peptides, tags).
target_masses, target_peptides, target_tags = targets
decoy_masses, decoy_peptides, decoy_tags = decoys

# i counts the spectra processed so far (1-based inside the loop).
i = 0
records = []
for i, (swl, peptide) in enumerate(spectra_with_labels_and_peptides, start=1):
    spec = swl['spec']
    # Only the first element of what infer_tags returns is used.
    spec_tags = infer_tags(spec, [3], 0.02)[0]

    # Score of the spectrum's tags against its labelled peptide's own tags.
    self_score = score(spec_tags, peptide.get_theo_tags(3))

    # search hands back a (score, index) pair; the index selects the matched
    # peptide from the corresponding database's peptide list.
    # NOTE(review): 250 is passed together with the precursor mass --
    # presumably a mass window; confirm against search().
    target_score, target_idx = search(
        spec_tags, target_masses, target_tags, 250, spec.precursor_mass, score)
    decoy_score, decoy_idx = search(
        spec_tags, decoy_masses, decoy_tags, 250, spec.precursor_mass, score)

    records.append({
        'spec': spec,
        'label': peptide,
        'signal_ratio': swl['signal_ratio'],
        'noise_ratio': swl['noise_ratio'],
        'num_mods': swl['num_mods'],
        'self_score': self_score,
        'target_id': target_peptides[target_idx],
        'target_score': target_score,
        'decoy_id': decoy_peptides[decoy_idx],
        'decoy_score': decoy_score,
    })

    # Report progress on stderr after every 500 spectra.
    if i % 500 == 0:
        print('{}\tFinish {} spectra.'.format(datetime.now(), i), file=sys.stderr)

########################
# print search results
########################

# Write the results to stdout as comma-separated text: one header row, then
# one row per record.  Values are written as-is, without any CSV quoting.
print('ScanNum,Label,SelfScore,TargetScore,TargetId,DecoyScore,DecoyId'
      ',SignalRatio,NoiseRatio,PrecursorMass,LabelMass,TargetMass,DecoyMass,NumMods')
for r in records:
    spectrum = r['spec']
    labelled = r['label']
    best_target = r['target_id']
    best_decoy = r['decoy_id']
    # Column order must match the header row above.
    row = (
        spectrum._scan_num, labelled.sequence, r['self_score'],
        r['target_score'], best_target.sequence, r['decoy_score'],
        best_decoy.sequence, r['signal_ratio'], r['noise_ratio'],
        spectrum.precursor_mass, labelled.mass, best_target.mass,
        best_decoy.mass, r['num_mods'],
    )
    print('{},{},{},{},{},{},{},{},{},{},{},{},{},{}'.format(*row))
