# -----------------------------------------------------------------------------
# ISMB'18 Tutorial
#
# CASMAP: Example 2, region-based GWAS
# -----------------------------------------------------------------------------

# Import the CASMAP package
library(CASMAP)

# Paths to the input files (genotype, phenotype and covariate).
# Note: to keep things simple, the data files are assumed to live in the same
# directory as the scripts.
genotype_file <- "X.dat"
phenotype_file <- "Y.dat"
covariate_file <- "C.dat"

# Build the object that performs region-based GWAS (interval search).
obj_rbg <- CASMAP(mode = "regionGWAS")

# Hyperparameters of the analysis:
# - alpha:         Target Family-Wise Error Rate (FWER).
# - max_comb_size: Maximum number of markers per region. For example, with a
#                  value of 5 only intervals of up to 5 consecutive SNPs
#                  (inclusive) are considered. Use 0 (the default) to consider
#                  intervals of arbitrary length.
obj_rbg$setTargetFWER(alpha = 0.05)
obj_rbg$setMaxCombinationSize(max_comb_size = 0)


# Show the current contents of the object.
print(obj_rbg)

# Load the input files into the object.
# Genotype files may use an additive encoding:
#   0 = homozygous major
#   1 = heterozygous
#   2 = homozygous minor
# In that case, use the extra input argument 'encoding' to select between
#   - a dominant encoding (0 = homozygous major, 1 = heterozygous and
#     homozygous minor), or
#   - a recessive encoding (0 = homozygous major and heterozygous,
#     1 = homozygous minor).
# This is not necessary for A. thaliana.
# Note: The covariate file is optional.
obj_rbg$readFiles(
  genotype_file = genotype_file,
  phenotype_file = phenotype_file,
  covariate_file = covariate_file
)

# Show the state of the object once more, now that the files have been read.
print(obj_rbg)

# Run the significant pattern mining algorithm to retrieve statistically
# associated genomic regions.
obj_rbg$execute()

# In Example 1, we retrieved the summary information with getSummary(). We can
# also save it to a file.

# All result files below are written under "output/". Make sure that directory
# exists first; the call is a no-op (and silent) if it is already there.
dir.create("output", showWarnings = FALSE, recursive = TRUE)

# Write high-level summary and profiling info related to the execution of the
# algorithm.
obj_rbg$writeSummary("output/summary.txt")
obj_rbg$writeProfile("output/profiling.txt")

# Write raw list of (possibly redundant) significantly associated genomic
# regions.
obj_rbg$writeSignificantRegions("output/significant_regions_raw.txt")

# Write post-processed list of disjoint clusters of significantly associated
# genomic regions.
obj_rbg$writeSignificantClusterRepresentatives("output/significant_regions_clustered.txt")

# Read the output files back in. The first line of each file is skipped
# (skip = 1) and the columns are named explicitly for the clustered results.
results_raw <- read.table(
  "output/significant_regions_raw.txt",
  sep = "\t", skip = 1, header = FALSE
)
results_clust <- read.table(
  "output/significant_regions_clustered.txt",
  sep = "\t", skip = 1, header = FALSE,
  col.names = c("p.value", "score", "OR", "index.set", "num.regions", "index.start", "index.end")
)

# Number of statistically significant intervals before clustering (raw) and
# after clustering. print() is explicit so the values are also shown when the
# script is run through source(), which does not auto-print.
print(nrow(results_raw))
print(nrow(results_clust))

# Show the top 10 most statistically significant intervals (smallest p-values
# first) among the clustered results.
sort_idx <- order(results_clust$p.value)
print(head(results_clust[sort_idx, c("p.value", "score", "index.start", "index.end")], n = 10))

# Read and print the contents of the profiling file, then of the summary file.
for (report_path in c("output/profiling.txt", "output/summary.txt")) {
  contents <- readLines(report_path)
  print(contents)
}
