import numpy
from numpy import std
from sys import exit, stdin, stdout, stderr
from os import path, listdir
from argparse import ArgumentError, ArgumentParser, SUPPRESS
from pandas import read_table
from sklearn.linear_model import (LassoCV, LassoLarsCV, LassoLarsIC,
LinearRegression, RidgeCV, ElasticNetCV)
from sklearn.grid_search import GridSearchCV
from matplotlib import rc_params_from_file
from functools import partial
from itertools import chain, repeat
import plotypus.lightcurve
from plotypus.lightcurve import (make_predictor, get_lightcurve_from_file,
plot_lightcurve)
from plotypus.periodogram import Lomb_Scargle, conditional_entropy
import plotypus
from plotypus.preprocessing import Fourier
from plotypus.utils import mad, pmap, verbose_print
from plotypus.resources import matplotlibrc
import pkg_resources # part of setuptools
__version__ = pkg_resources.require("plotypus")[0].version
[docs]def get_args():
parser = ArgumentParser()
general_group = parser.add_argument_group('General')
param_group = parser.add_argument_group('Star Parameters')
parallel_group = parser.add_argument_group('Parallel')
period_group = parser.add_argument_group('Periodogram')
fourier_group = parser.add_argument_group('Fourier')
outlier_group = parser.add_argument_group('Outlier Detection')
# regression_group = parser.add_argument_group('Regression')
parser.add_argument('--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
general_group.add_argument('-i', '--input', type=str,
default=None,
help='location of stellar observations '
'(default = stdin)')
general_group.add_argument('-o', '--output', type=str,
default=None,
help='location of plots, or nothing if no plots are to be generated '
'(default = None)')
general_group.add_argument('-n', '--star-name', type=str,
default=None,
help='name of star '
'(default = name of input file)')
general_group.add_argument('-f', '--format', type=str,
default='%.5f',
help='format specifier for output table')
general_group.add_argument('--output-sep', type=str,
default='\t',
help='column separator string in output table '
'(default = TAB)')
general_group.add_argument('--no-header', action='store_true',
help='suppress header row in table output')
general_group.add_argument('--sanitize-latex', action='store_true',
help='enable to sanitize star names for LaTeX formatting')
general_group.add_argument('--legend', action='store_true',
help='whether legends should be put on the output plots '
'(default = False)')
general_group.add_argument('--extension', type=str,
default='.dat', metavar='EXT',
help='extension which follows a star\'s name in data filenames '
'(default = ".dat")')
general_group.add_argument('--skiprows', type=int,
default=0,
help='number of rows at the head of each file to skip')
general_group.add_argument('--use-cols', type=int, nargs='+',
default=SUPPRESS,
help='columns to use from data file '
'(default = 0 1 2)')
general_group.add_argument('-s', '--scoring', type=str,
choices=['MSE', 'R2'], default=SUPPRESS,
help='scoring metric to use '
'(default = "R2")')
general_group.add_argument('--scoring-cv', type=int,
default=SUPPRESS, metavar='N',
help='number of folds in the scoring regularization_cv validation '
'(default = 3)')
general_group.add_argument('--shift', type=float,
default=None,
help='phase shift to apply to each light curve, or shift to max '
'light if None given '
'(default = None)')
general_group.add_argument('--phase-points', type=int,
default=100, metavar='N',
help='number of phase points to output '
'(default = 100)')
general_group.add_argument('--min-phase-cover', type=float,
default=SUPPRESS, metavar='COVER',
help='minimum fraction of phases that must have points '
'(default = 0)')
general_group.add_argument('--min-observations', type=int,
default=1, metavar='N',
help='minimum number of observation needed to avoid skipping a star '
'(default = 1)')
general_group.add_argument('--matplotlibrc', type=str,
default=matplotlibrc,
metavar='RC',
help='matplotlibrc file to use for formatting plots '
'(default file is in plotypus.resources.matplotlibrc)')
general_group.add_argument('-v', '--verbosity', type=str, action='append',
default=None, choices=['all', 'coverage', 'outlier', 'period'],
metavar='OPERATION',
help='specifies an operation to print verbose output for, or '
'"all" to print all verbose output '
'(default = None)')
param_group.add_argument('--parameters', type=str,
default=None, metavar='FILE',
help='file containing table of parameters such as period and shift '
'(default = None)')
param_group.add_argument('--param-sep', type=str,
default="\\s+",
help='string or regex to use as column separator when reading '
'parameters file '
'(default = any whitespace)')
param_group.add_argument('--period-label', type=str,
default='Period', metavar='LABEL',
help='title of period column in parameters file '
'(default = Period)')
param_group.add_argument('--shift-label', type=str,
default='Shift', metavar='LABEL',
help='title of shift column in parameters file '
'(default = Shift)')
parallel_group.add_argument('--star-processes', type=int,
default=1, metavar='N',
help='number of stars to process in parallel '
'(default = 1)')
parallel_group.add_argument('--selector-processes', type=int,
default=SUPPRESS, metavar='N',
help='number of processes to use for each selector '
'(default depends on selector used)')
parallel_group.add_argument('--scoring-processes', type=int,
default=SUPPRESS, metavar='N',
help='number of processes to use for scoring, if not done by selector '
'(default = 1)')
parallel_group.add_argument('--period-processes', type=int,
default=1, metavar='N',
help='number of periods to process in parallel '
'(default = 1)')
period_group.add_argument('--period', type=float,
default=None,
help='period to use for all stars '
'(default = None)')
period_group.add_argument('--min-period', type=float,
default=SUPPRESS, metavar='P',
help='minimum period of each star '
'(default = 0.2)')
period_group.add_argument('--max-period', type=float,
default=SUPPRESS, metavar='P',
help='maximum period of each star '
'(default = 32.0)')
period_group.add_argument('--coarse-precision', type=float,
default=SUPPRESS,
help='level of granularity on first pass '
'(default = 0.00001)')
period_group.add_argument('--fine-precision', type=float,
default=SUPPRESS,
help='level of granularity on second pass '
'(default = 0.000000001)')
period_group.add_argument('--periodogram', type=str,
choices=["Lomb_Scargle", "conditional_entropy"],
default="Lomb_Scargle",
help='method for determining period '
'(default = Lomb_Scargle)')
fourier_group.add_argument('-d', '--fourier-degree', type=int, nargs=2,
default=(2, 20), metavar=('MIN', 'MAX'),
help='range of degrees of fourier fits to use '
'(default = 2 20)')
fourier_group.add_argument('-r', '--regressor',
choices=['LassoCV', 'LassoLarsCV', 'LassoLarsIC', 'OLS', 'RidgeCV', 'ElasticNetCV'],
default='LassoLarsIC',
help='type of regressor to use '
'(default = "Lasso")')
fourier_group.add_argument('--selector',
choices=['Baart', 'GridSearch'],
default='GridSearch',
help='type of model selector to use '
'(default = "GridSearch")')
fourier_group.add_argument('--series-form', type=str,
default='cos', choices=['sin', 'cos'],
help='form of Fourier series to use in coefficient output, '
'does not affect the fit '
'(default = "cos")')
fourier_group.add_argument('--max-iter', type=int,
default=1000, metavar='N',
help='maximum number of iterations in the regularization path '
'(default = 1000)')
fourier_group.add_argument('--regularization-cv', type=int,
default=None, metavar='N',
help='number of folds used in regularization regularization_cv validation '
'(default = 3)')
outlier_group.add_argument('--sigma', type=float,
default=SUPPRESS,
help='rejection criterion for outliers '
'(default = 20)')
outlier_group.add_argument('--sigma-clipping', type=str,
choices=["std", "mad"], default="mad",
help='sigma clipping metric to use '
'(default = "mad")')
args = parser.parse_args()
if args.output is not None:
rcParams = rc_params_from_file(fname=args.matplotlibrc,
fail_on_error=args.output)
plotypus.lightcurve.matplotlib.rcParams = rcParams
regressor_choices = {
"LassoCV" : LassoCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsCV" : LassoLarsCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False),
"LassoLarsIC" : LassoLarsIC(max_iter=args.max_iter,
fit_intercept=False),
"OLS" : LinearRegression(fit_intercept=False),
"RidgeCV" : RidgeCV(cv=args.regularization_cv,
fit_intercept=False),
"ElasticNetCV" : ElasticNetCV(max_iter=args.max_iter,
cv=args.regularization_cv,
fit_intercept=False)
}
selector_choices = {
"Baart" : None,
"GridSearch" : GridSearchCV
}
periodogram_choices = {
"Lomb_Scargle" : Lomb_Scargle,
"conditional_entropy" : conditional_entropy
}
sigma_clipping_choices = {
"std" : std,
"mad" : mad
}
if hasattr(args, 'scoring'):
scoring_choices = {
'R2' : 'r2',
'MSE' : 'mean_squared_error'
}
args.scoring = scoring_choices[args.scoring]
args.regressor = regressor_choices[args.regressor]
Selector = selector_choices[args.selector] or GridSearchCV
args.periodogram = periodogram_choices[args.periodogram]
args.sigma_clipping = sigma_clipping_choices[args.sigma_clipping]
args.predictor = make_predictor(Selector=Selector,
use_baart=(args.selector == 'Baart'),
**vars(args))
args.phases = numpy.arange(0, 1, 1/args.phase_points)
if args.parameters is not None:
args.parameters = read_table(args.parameters, args.param_sep,
index_col=0, engine='python')
return args
[docs]def main():
args = get_args()
min_degree, max_degree = args.fourier_degree
filenames = list(map(lambda x: x.strip(), _get_files(args.input)))
filepaths = map(lambda filename:
filename if path.isfile(filename)
else path.join(args.input, filename),
filenames)
# a dict containing all options which can be pickled, because
# all parameters to pmap must be picklable
picklable_args = {k: vars(args)[k]
for k in vars(args)
if k not in {'input'}}
sep = args.output_sep
if not args.no_header:
# print file header
print(*['Name',
'Period',
'Shift',
'Coverage',
'Inliers',
'Outliers',
'R^2',
'MSE',
'MaxDegree',
'Params',
'A_0',
'dA_0',
sep.join(map(('A_{0}' + sep + 'Phi_{0}').format,
range(1, max_degree+1))),
sep.join(map(('R_{0}1' + sep + 'phi_{0}1').format,
range(2, max_degree+1))),
sep.join(map('Phase{}'.format,
range(args.phase_points)))],
sep=sep)
printer = lambda result: _print_star(result, max_degree, args.series_form,
args.format, sep) \
if result is not None else None
pmap(process_star, filepaths, callback=printer,
processes=args.star_processes, **picklable_args)
[docs]def process_star(filename, output, *, extension, star_name, period, shift,
parameters, period_label, shift_label, **kwargs):
"""Processes a star's lightcurve, prints its coefficients, and saves
its plotted lightcurve to a file. Returns the result of get_lightcurve.
"""
if star_name is None:
basename = path.basename(filename)
if basename.endswith(extension):
star_name = basename[:-len(extension)]
else:
# file has wrong extension
return
if parameters is not None:
if period is None:
try:
period = parameters[period_label][star_name]
except KeyError:
pass
if shift is None:
try:
shift = parameters.loc[shift_label][star_name]
except KeyError:
pass
result = get_lightcurve_from_file(filename, name=star_name,
period=period, shift=shift,
**kwargs)
if result is None:
return
if output is not None:
plot_lightcurve(star_name, result['lightcurve'], result['period'],
result['phased_data'], output=output, **kwargs)
return result
def _print_star(result, max_degree, form, fmt, sep):
if result is None:
return
# function which formats every number in a sequence according to fmt
format_all = partial(map, lambda x: fmt % x)
# count inliers and outliers
points = result['phased_data'][:,0].size
outliers = numpy.ma.count_masked(result['phased_data'][:, 0])
inliers = points - outliers
# get fourier coefficients and compute ratios
coefs = Fourier.phase_shifted_coefficients(result['coefficients'],
shift=result['shift'],
form=form)
_coefs = numpy.concatenate(([coefs[0]],
[result['dA_0']],
coefs[1:]))
fourier_ratios = Fourier.fourier_ratios(coefs)
# create the vectors of zeroes
coef_zeros = repeat('0', times=(2*max_degree + 1 - len(coefs)))
ratio_zeros = repeat('0', times=(2*(max_degree - 1) - len(fourier_ratios)))
max_degree = numpy.trim_zeros(coefs[1::2], 'b').size
n_params = numpy.count_nonzero(coefs[1::2])
# print the entry for the star with tabs as separators
# and itertools.chain to separate the different results into a
# continuous list which is then unpacked
print(*chain(*[[result['name']],
map(str,
[result['period'], result['shift'], result['coverage'],
inliers, outliers,
result['R2'], result['MSE'],
max_degree, n_params]),
# coefficients and fourier ratios with trailing zeros
# formatted defined by the user-provided fmt string
format_all(_coefs), coef_zeros,
format_all(fourier_ratios), ratio_zeros,
format_all(result['lightcurve'])]),
sep=sep)
def _get_files(input):
if input is None:
return stdin
elif input[0] == "@":
with open(input[1:], 'r') as f:
return map(lambda x: x.strip(), f.readlines())
elif path.isfile(input):
return [input]
elif path.isdir(input):
return sorted(listdir(input))
else:
raise FileNotFoundError('file {} not found'.format(input))
if __name__ == "__main__":
exit(main())