# -----------------------------------------------------------------------------
# Preprocess the Terada et al. 2013 data to be used by CASMAP
#
# June 2018, D. Roqueiro
# -----------------------------------------------------------------------------

import sys
import os
import numpy as np
import argparse

# -----------------------------------------------------------------------------
# Main program
# -----------------------------------------------------------------------------

# Build the command-line interface; both options are mandatory
parser = argparse.ArgumentParser(
    description="Convert breast cancer dataset (from Terada 2013 to CASMAP)")
for option, option_help in (
        ("--tx_file", "Full path to file with the transaction list"),
        ("--matrix_file", "Full path to output binary matrix")):
    parser.add_argument(option, required=True, help=option_help)
args = parser.parse_args()

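# The transaction file tx_file contains a transaction list of the form:
#   TF1   TF3   TF21
#   TF7   TF9
#
#   TF2   TF6
#   :
# (The "TF" prefix above is only illustrative: each field is parsed with int(),
#  so the actual file must contain bare, space-separated integer ids, e.g. "1 3 21")
# Each row is a gene, and it contains a list of TF ids obtained from MSigDB (see file tf_list.txt)
# Note that some lines are empty. This means that no TF was linked to that gene.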
# 
# Do a first pass to determine the dimensions of the matrix we need to create:
# - number of genes (rows)
# - number of motifs (columns)
# Note: The final matrix is transposed with respect to the transaction list
try:
    f_in = open(args.tx_file, 'r')
except IOError:
    # print(...) with one pre-formatted string behaves identically under
    # Python 2 and Python 3 (the bare print statement is Python 2 only)
    print("Cannot open input file %s" % args.tx_file)
    sys.exit(1)

# First pass: count every line (empty ones included, since each line is a gene)
# and find the largest motif id that appears anywhere in the file
line_num = 0
max_motif = -1
with f_in:  # closes the handle even if a line fails to parse
    for line in f_in:
        # Count the lines
        line_num += 1
        # split() with no argument splits on any run of whitespace, so tabs or
        # repeated spaces between ids do not produce empty fields
        parts = line.split()
        # Skip empty lines (gene with no TF linked to it)
        if not parts:
            continue
        # Scan every id instead of trusting the last one to be the largest:
        # an unsorted line would otherwise leave the matrix too narrow and
        # make the second pass fail with an out-of-bounds index
        max_motif = max(max_motif, max(int(p) for p in parts))

# Allocate the gene-by-motif matrix, initialized to zeros:
# one row per line of the transaction file, one column per motif id 0..max_motif
mat = np.zeros((line_num, max_motif + 1), dtype=np.int32)

# Second pass: set mat[gene, motif] = 1 for every motif id listed on a gene's line
with open(args.tx_file, 'r') as f_in:  # closes the handle even on a parse error
    # enumerate() counts empty lines too, so row i always matches line i of the file
    for i, line in enumerate(f_in):
        # split() with no argument splits on any run of whitespace; an empty
        # line yields no ids and simply leaves its row at zero
        for j in line.split():
            mat[i, int(j)] = 1

# Save the matrix to a file, transposed with respect to the transaction list
# (motifs as rows, genes as columns), one integer per entry
np.savetxt(args.matrix_file, np.transpose(mat), fmt="%d")

