## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements. See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the "License"); you may not use this file except in compliance with# the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.#importosimportoperatorimportsysimportuuidimportwarningsfromabcimportABCMeta,abstractmethodfrommultiprocessing.poolimportThreadPoolfromtypingimport(Any,Dict,Generic,Iterable,List,Optional,Type,TypeVar,Union,cast,overload,TYPE_CHECKING,)frompysparkimportkeyword_only,since,SparkContext,inheritable_thread_targetfrompyspark.mlimportEstimator,Predictor,PredictionModel,Modelfrompyspark.ml.param.sharedimport(HasRawPredictionCol,HasProbabilityCol,HasThresholds,HasRegParam,HasMaxIter,HasFitIntercept,HasTol,HasStandardization,HasWeightCol,HasAggregationDepth,HasThreshold,HasBlockSize,HasMaxBlockSizeInMB,Param,Params,TypeConverters,HasElasticNetParam,HasSeed,HasStepSize,HasSolver,HasParallelism,)frompyspark.ml.treeimport(_DecisionTreeModel,_DecisionTreeParams,_TreeEnsembleModel,_RandomForestParams,_GBTParams,_HasVarianceImpurity,_TreeClassifierParams,)frompyspark.ml.regressionimport_FactorizationMachinesParams,DecisionTreeRegressionModelfrompyspark.ml.baseimport_PredictorParamsfrompyspark.ml.utilimport(DefaultParamsReader,DefaultParamsWriter,JavaMLReadable,JavaMLReader,JavaMLWritable,JavaMLWriter,MLReader,MLReadable,MLWriter,MLWritable,HasTrainingSummary,)frompyspark.ml.wrapperimportJavaParams,JavaPredictor,JavaPredictionModel,JavaWrapperfrompyspark.ml.commonimportinherit_docfrompyspark.ml.linalgimportMatrix,Vector,Vectors,VectorUDTfrompyspark.sqlimportDataFrame,Rowfrompyspark.sql.functionsimportudf,whenfrompyspark.sql.typesimportArrayType,DoubleTypefrompyspark.storagelevelimportStorageLevelifTYPE_CHECKING:frompyspark.ml._typingimportP,ParamMapfrompy4j.java_gatewayimportJavaObjectT=TypeVar("T")JPM=TypeVar("JPM",bound=JavaPredictionModel)CM=TypeVar("CM",bound="ClassificationModel")__all__=["LinearSVC","LinearSVCModel","LinearSVCSummary","LinearSVCTrainingSummary","LogisticRegression","LogisticRegressionModel","LogisticRegressionSummary","LogisticRegressionTrainingSummary","BinaryLogisticRegressionSummary","BinaryLogisticRegressionTrainingSummary","DecisionTreeClassifier","DecisionTreeClassificationModel","GBTClassifier","GBTClassificationModel","RandomForestClassifier","RandomForestClassificationModel","RandomForestClassificationSummary","RandomForestClassificationTrainingSummary","BinaryRandomForestClassificationSummary","BinaryRandomForestClassificationTrainingSummary","NaiveBayes","NaiveBayesModel","MultilayerPerceptronClassifier","MultilayerPerceptronClassificationModel","MultilayerPerceptronClassificationSummary","MultilayerPerceptronClassificationTrainingSummary","OneVsRest","OneVsRestModel","FMClassifier","FMClassificationModel","FMClassificationSummary","FMClassificationTrainingSummary",]class_ClassifierParams(HasRawPredictionCol,_PredictorParams):""" Classifier Params for classification tasks. .. 
versionadded:: 3.0.0 """pass@inherit_docclassClassifier(Predictor[CM],_ClassifierParams,Generic[CM],metaclass=ABCMeta):""" Classifier for classification tasks. Classes are indexed {0, 1, ..., numClasses - 1}. """@since("3.0.0")defsetRawPredictionCol(self:"P",value:str)->"P":""" Sets the value of :py:attr:`rawPredictionCol`. """returnself._set(rawPredictionCol=value)@inherit_docclassClassificationModel(PredictionModel,_ClassifierParams,metaclass=ABCMeta):""" Model produced by a ``Classifier``. Classes are indexed {0, 1, ..., numClasses - 1}. """@since("3.0.0")defsetRawPredictionCol(self:"P",value:str)->"P":""" Sets the value of :py:attr:`rawPredictionCol`. """returnself._set(rawPredictionCol=value)@property@abstractmethod@since("2.1.0")defnumClasses(self)->int:""" Number of classes (values which the label can take). """raiseNotImplementedError()@abstractmethod@since("3.0.0")defpredictRaw(self,value:Vector)->Vector:""" Raw prediction for each possible label. """raiseNotImplementedError()class_ProbabilisticClassifierParams(HasProbabilityCol,HasThresholds,_ClassifierParams):""" Params for :py:class:`ProbabilisticClassifier` and :py:class:`ProbabilisticClassificationModel`. .. versionadded:: 3.0.0 """pass@inherit_docclassProbabilisticClassifier(Classifier,_ProbabilisticClassifierParams,metaclass=ABCMeta):""" Probabilistic Classifier for classification tasks. """@since("3.0.0")defsetProbabilityCol(self:"P",value:str)->"P":""" Sets the value of :py:attr:`probabilityCol`. """returnself._set(probabilityCol=value)@since("3.0.0")defsetThresholds(self:"P",value:List[float])->"P":""" Sets the value of :py:attr:`thresholds`. """returnself._set(thresholds=value)@inherit_docclassProbabilisticClassificationModel(ClassificationModel,_ProbabilisticClassifierParams,metaclass=ABCMeta):""" Model produced by a ``ProbabilisticClassifier``. """@since("3.0.0")defsetProbabilityCol(self:CM,value:str)->CM:""" Sets the value of :py:attr:`probabilityCol`. """returnself._set(probabilityCol=value)@since("3.0.0")defsetThresholds(self:CM,value:List[float])->CM:""" Sets the value of :py:attr:`thresholds`. """returnself._set(thresholds=value)@abstractmethod@since("3.0.0")defpredictProbability(self,value:Vector)->Vector:""" Predict the probability of each class given the features. """raiseNotImplementedError()@inherit_docclass_JavaClassifier(Classifier,JavaPredictor[JPM],Generic[JPM],metaclass=ABCMeta):""" Java Classifier for classification tasks. Classes are indexed {0, 1, ..., numClasses - 1}. """@since("3.0.0")defsetRawPredictionCol(self:"P",value:str)->"P":""" Sets the value of :py:attr:`rawPredictionCol`. """returnself._set(rawPredictionCol=value)@inherit_docclass_JavaClassificationModel(ClassificationModel,JavaPredictionModel[T]):""" Java Model produced by a ``Classifier``. Classes are indexed {0, 1, ..., numClasses - 1}. To be mixed in with :class:`pyspark.ml.JavaModel` """@property@since("2.1.0")defnumClasses(self)->int:""" Number of classes (values which the label can take). """returnself._call_java("numClasses")@since("3.0.0")defpredictRaw(self,value:Vector)->Vector:""" Raw prediction for each possible label. """returnself._call_java("predictRaw",value)@inherit_docclass_JavaProbabilisticClassifier(ProbabilisticClassifier,_JavaClassifier[JPM],Generic[JPM],metaclass=ABCMeta):""" Java Probabilistic Classifier for classification tasks. """pass@inherit_docclass_JavaProbabilisticClassificationModel(ProbabilisticClassificationModel,_JavaClassificationModel[T]):""" Java Model produced by a ``ProbabilisticClassifier``. 
"""@since("3.0.0")defpredictProbability(self,value:Vector)->Vector:""" Predict the probability of each class given the features. """returnself._call_java("predictProbability",value)@inherit_docclass_ClassificationSummary(JavaWrapper):""" Abstraction for multiclass classification results for a given model. .. versionadded:: 3.1.0 """@property@since("3.1.0")defpredictions(self)->DataFrame:""" Dataframe outputted by the model's `transform` method. """returnself._call_java("predictions")@property@since("3.1.0")defpredictionCol(self)->str:""" Field in "predictions" which gives the prediction of each class. """returnself._call_java("predictionCol")@property@since("3.1.0")deflabelCol(self)->str:""" Field in "predictions" which gives the true label of each instance. """returnself._call_java("labelCol")@property@since("3.1.0")defweightCol(self)->str:""" Field in "predictions" which gives the weight of each instance as a vector. """returnself._call_java("weightCol")@propertydeflabels(self)->List[str]:""" Returns the sequence of labels in ascending order. This order matches the order used in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. .. versionadded:: 3.1.0 Notes ----- In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the training set is missing a label, then all of the arrays over labels (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the expected numClasses. """returnself._call_java("labels")@property@since("3.1.0")deftruePositiveRateByLabel(self)->List[float]:""" Returns true positive rate for each label (category). """returnself._call_java("truePositiveRateByLabel")@property@since("3.1.0")deffalsePositiveRateByLabel(self)->List[float]:""" Returns false positive rate for each label (category). """returnself._call_java("falsePositiveRateByLabel")@property@since("3.1.0")defprecisionByLabel(self)->List[float]:""" Returns precision for each label (category). """returnself._call_java("precisionByLabel")@property@since("3.1.0")defrecallByLabel(self)->List[float]:""" Returns recall for each label (category). """returnself._call_java("recallByLabel")@since("3.1.0")deffMeasureByLabel(self,beta:float=1.0)->List[float]:""" Returns f-measure for each label (category). """returnself._call_java("fMeasureByLabel",beta)@property@since("3.1.0")defaccuracy(self)->float:""" Returns accuracy. (equals to the total number of correctly classified instances out of the total number of instances.) """returnself._call_java("accuracy")@property@since("3.1.0")defweightedTruePositiveRate(self)->float:""" Returns weighted true positive rate. (equals to precision, recall and f-measure) """returnself._call_java("weightedTruePositiveRate")@property@since("3.1.0")defweightedFalsePositiveRate(self)->float:""" Returns weighted false positive rate. """returnself._call_java("weightedFalsePositiveRate")@property@since("3.1.0")defweightedRecall(self)->float:""" Returns weighted averaged recall. (equals to precision, recall and f-measure) """returnself._call_java("weightedRecall")@property@since("3.1.0")defweightedPrecision(self)->float:""" Returns weighted averaged precision. """returnself._call_java("weightedPrecision")@since("3.1.0")defweightedFMeasure(self,beta:float=1.0)->float:""" Returns weighted averaged f-measure. """returnself._call_java("weightedFMeasure",beta)@inherit_docclass_TrainingSummary(JavaWrapper):""" Abstraction for Training results. .. 
versionadded:: 3.1.0 """@property@since("3.1.0")defobjectiveHistory(self)->List[float]:""" Objective function (scaled loss + regularization) at each iteration. It contains one more element, the initial state, than number of iterations. """returnself._call_java("objectiveHistory")@property@since("3.1.0")deftotalIterations(self)->int:""" Number of training iterations until termination. """returnself._call_java("totalIterations")@inherit_docclass_BinaryClassificationSummary(_ClassificationSummary):""" Binary classification results for a given model. .. versionadded:: 3.1.0 """@property@since("3.1.0")defscoreCol(self)->str:""" Field in "predictions" which gives the probability or raw prediction of each class as a vector. """returnself._call_java("scoreCol")@propertydefroc(self)->DataFrame:""" Returns the receiver operating characteristic (ROC) curve, which is a Dataframe having two fields (FPR, TPR) with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. .. versionadded:: 3.1.0 Notes ----- `Wikipedia reference <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ """returnself._call_java("roc")@property@since("3.1.0")defareaUnderROC(self)->float:""" Computes the area under the receiver operating characteristic (ROC) curve. """returnself._call_java("areaUnderROC")@property@since("3.1.0")defpr(self)->DataFrame:""" Returns the precision-recall curve, which is a Dataframe containing two fields recall, precision with (0.0, 1.0) prepended to it. """returnself._call_java("pr")@property@since("3.1.0")deffMeasureByThreshold(self)->DataFrame:""" Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. """returnself._call_java("fMeasureByThreshold")@property@since("3.1.0")defprecisionByThreshold(self)->DataFrame:""" Returns a dataframe with two fields (threshold, precision) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the precision. """returnself._call_java("precisionByThreshold")@property@since("3.1.0")defrecallByThreshold(self)->DataFrame:""" Returns a dataframe with two fields (threshold, recall) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the recall. """returnself._call_java("recallByThreshold")class_LinearSVCParams(_ClassifierParams,HasRegParam,HasMaxIter,HasFitIntercept,HasTol,HasStandardization,HasWeightCol,HasAggregationDepth,HasThreshold,HasMaxBlockSizeInMB,):""" Params for :py:class:`LinearSVC` and :py:class:`LinearSVCModel`. .. versionadded:: 3.0.0 """threshold:Param[float]=Param(Params._dummy(),"threshold","The threshold in binary classification applied to the linear model"" prediction. This threshold can be any real number, where Inf will make"" all predictions 0.0 and -Inf will make all predictions 1.0.",typeConverter=TypeConverters.toFloat,)def__init__(self,*args:Any)->None:super(_LinearSVCParams,self).__init__(*args)self._setDefault(maxIter=100,regParam=0.0,tol=1e-6,fitIntercept=True,standardization=True,threshold=0.0,aggregationDepth=2,maxBlockSizeInMB=0.0,)
[docs]@since("2.2.0")defsetMaxIter(self,value:int)->"LinearSVC":""" Sets the value of :py:attr:`maxIter`. """returnself._set(maxIter=value)
[docs]@since("2.2.0")defsetRegParam(self,value:float)->"LinearSVC":""" Sets the value of :py:attr:`regParam`. """returnself._set(regParam=value)
[docs]@since("2.2.0")defsetTol(self,value:float)->"LinearSVC":""" Sets the value of :py:attr:`tol`. """returnself._set(tol=value)
[docs]@since("2.2.0")defsetFitIntercept(self,value:bool)->"LinearSVC":""" Sets the value of :py:attr:`fitIntercept`. """returnself._set(fitIntercept=value)
[docs]@since("2.2.0")defsetStandardization(self,value:bool)->"LinearSVC":""" Sets the value of :py:attr:`standardization`. """returnself._set(standardization=value)
[docs]@since("2.2.0")defsetThreshold(self,value:float)->"LinearSVC":""" Sets the value of :py:attr:`threshold`. """returnself._set(threshold=value)
[docs]@since("2.2.0")defsetWeightCol(self,value:str)->"LinearSVC":""" Sets the value of :py:attr:`weightCol`. """returnself._set(weightCol=value)
[docs]@since("2.2.0")defsetAggregationDepth(self,value:int)->"LinearSVC":""" Sets the value of :py:attr:`aggregationDepth`. """returnself._set(aggregationDepth=value)
[docs]@since("3.1.0")defsetMaxBlockSizeInMB(self,value:float)->"LinearSVC":""" Sets the value of :py:attr:`maxBlockSizeInMB`. """returnself._set(maxBlockSizeInMB=value)
class LinearSVCModel(
    _JavaClassificationModel[Vector],
    _LinearSVCParams,
    JavaMLWritable,
    JavaMLReadable["LinearSVCModel"],
    HasTrainingSummary["LinearSVCTrainingSummary"],
):
    """
    Model fitted by LinearSVC.

    .. versionadded:: 2.2.0
    """

    @since("3.0.0")
    def setThreshold(self, value: float) -> "LinearSVCModel":
        """
        Sets the value of :py:attr:`threshold`.
        """
        return self._set(threshold=value)

    @property
    @since("2.2.0")
    def coefficients(self) -> Vector:
        """
        Model coefficients of Linear SVM Classifier.
        """
        return self._call_java("coefficients")

    @property
    @since("2.2.0")
    def intercept(self) -> float:
        """
        Model intercept of Linear SVM Classifier.
        """
        return self._call_java("intercept")
[docs]@since("3.1.0")defsummary(self)->"LinearSVCTrainingSummary":""" Gets summary (accuracy/precision/recall, objective history, total iterations) of model trained on the training set. An exception is thrown if `trainingSummary is None`. """ifself.hasSummary:returnLinearSVCTrainingSummary(super(LinearSVCModel,self).summary)else:raiseRuntimeError("No training summary available for this %s"%self.__class__.__name__)
[docs]defevaluate(self,dataset:DataFrame)->"LinearSVCSummary":""" Evaluates the model on a test dataset. .. versionadded:: 3.1.0 Parameters ---------- dataset : :py:class:`pyspark.sql.DataFrame` Test dataset to evaluate model on. """ifnotisinstance(dataset,DataFrame):raiseTypeError("dataset must be a DataFrame but got %s."%type(dataset))java_lsvc_summary=self._call_java("evaluate",dataset)returnLinearSVCSummary(java_lsvc_summary)
class LinearSVCSummary(_BinaryClassificationSummary):
    """
    Abstraction for LinearSVC Results for a given model.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class LinearSVCTrainingSummary(LinearSVCSummary, _TrainingSummary):
    """
    Abstraction for LinearSVC Training results.

    .. versionadded:: 3.1.0
    """

    pass
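# Illustrative sketch (not part of the original module): how the LinearSVC summary
# classes above are typically reached from a fitted model. Assumes an active
# SparkSession named `spark`; the tiny DataFrame and helper name are hypothetical.
def _example_linear_svc_summary(spark):
    from pyspark.ml.classification import LinearSVC
    from pyspark.ml.linalg import Vectors

    df = spark.createDataFrame(
        [(0.0, Vectors.dense([0.0, 1.0])), (1.0, Vectors.dense([1.0, 0.0]))],
        ["label", "features"],
    )
    model = LinearSVC(maxIter=10, regParam=0.1).fit(df)

    train_summary = model.summary          # LinearSVCTrainingSummary
    print(train_summary.objectiveHistory)  # scaled loss at each iteration
    print(train_summary.areaUnderROC)      # binary metric from _BinaryClassificationSummary

    test_summary = model.evaluate(df)      # LinearSVCSummary on an arbitrary dataset
    print(test_summary.accuracy)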
class_LogisticRegressionParams(_ProbabilisticClassifierParams,HasRegParam,HasElasticNetParam,HasMaxIter,HasFitIntercept,HasTol,HasStandardization,HasWeightCol,HasAggregationDepth,HasThreshold,HasMaxBlockSizeInMB,):""" Params for :py:class:`LogisticRegression` and :py:class:`LogisticRegressionModel`. .. versionadded:: 3.0.0 """threshold:Param[float]=Param(Params._dummy(),"threshold","Threshold in binary classification prediction, in range [0, 1]."+" If threshold and thresholds are both set, they must match."+"e.g. if threshold is p, then thresholds must be equal to [1-p, p].",typeConverter=TypeConverters.toFloat,)family:Param[str]=Param(Params._dummy(),"family","The name of family which is a description of the label distribution to "+"be used in the model. Supported options: auto, binomial, multinomial",typeConverter=TypeConverters.toString,)lowerBoundsOnCoefficients:Param[Matrix]=Param(Params._dummy(),"lowerBoundsOnCoefficients","The lower bounds on coefficients if fitting under bound ""constrained optimization. The bound matrix must be ""compatible with the shape ""(1, number of features) for binomial regression, or ""(number of classes, number of features) ""for multinomial regression.",typeConverter=TypeConverters.toMatrix,)upperBoundsOnCoefficients:Param[Matrix]=Param(Params._dummy(),"upperBoundsOnCoefficients","The upper bounds on coefficients if fitting under bound ""constrained optimization. The bound matrix must be ""compatible with the shape ""(1, number of features) for binomial regression, or ""(number of classes, number of features) ""for multinomial regression.",typeConverter=TypeConverters.toMatrix,)lowerBoundsOnIntercepts:Param[Vector]=Param(Params._dummy(),"lowerBoundsOnIntercepts","The lower bounds on intercepts if fitting under bound ""constrained optimization. The bounds vector size must be""equal with 1 for binomial regression, or the number of""lasses for multinomial regression.",typeConverter=TypeConverters.toVector,)upperBoundsOnIntercepts:Param[Vector]=Param(Params._dummy(),"upperBoundsOnIntercepts","The upper bounds on intercepts if fitting under bound ""constrained optimization. The bound vector size must be ""equal with 1 for binomial regression, or the number of ""classes for multinomial regression.",typeConverter=TypeConverters.toVector,)def__init__(self,*args:Any):super(_LogisticRegressionParams,self).__init__(*args)self._setDefault(maxIter=100,regParam=0.0,tol=1e-6,threshold=0.5,family="auto",maxBlockSizeInMB=0.0)@since("1.4.0")defsetThreshold(self:"P",value:float)->"P":""" Sets the value of :py:attr:`threshold`. Clears value of :py:attr:`thresholds` if it has been set. """self._set(threshold=value)self.clear(self.thresholds)# type: ignore[attr-defined]returnself@since("1.4.0")defgetThreshold(self)->float:""" Get threshold for binary classification. If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), this returns the equivalent threshold: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. Otherwise, returns :py:attr:`threshold` if set or its default value if unset. """self._checkThresholdConsistency()ifself.isSet(self.thresholds):ts=self.getOrDefault(self.thresholds)iflen(ts)!=2:raiseValueError("Logistic Regression getThreshold only applies to"+" binary classification, but thresholds has length != 2."+" thresholds: {ts}".format(ts=ts))return1.0/(1.0+ts[0]/ts[1])else:returnself.getOrDefault(self.threshold)@since("1.5.0")defsetThresholds(self:"P",value:List[float])->"P":""" Sets the value of :py:attr:`thresholds`. 
Clears value of :py:attr:`threshold` if it has been set. """self._set(thresholds=value)self.clear(self.threshold)# type: ignore[attr-defined]returnself@since("1.5.0")defgetThresholds(self)->List[float]:""" If :py:attr:`thresholds` is set, return its value. Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary classification: (1-threshold, threshold). If neither are set, throw an error. """self._checkThresholdConsistency()ifnotself.isSet(self.thresholds)andself.isSet(self.threshold):t=self.getOrDefault(self.threshold)return[1.0-t,t]else:returnself.getOrDefault(self.thresholds)def_checkThresholdConsistency(self)->None:ifself.isSet(self.threshold)andself.isSet(self.thresholds):ts=self.getOrDefault(self.thresholds)iflen(ts)!=2:raiseValueError("Logistic Regression getThreshold only applies to"+" binary classification, but thresholds has length != 2."+" thresholds: {0}".format(str(ts)))t=1.0/(1.0+ts[0]/ts[1])t2=self.getOrDefault(self.threshold)ifabs(t2-t)>=1e-5:raiseValueError("Logistic Regression getThreshold found inconsistent values for"+" threshold (%g) and thresholds (equivalent to %g)"%(t2,t))@since("2.1.0")defgetFamily(self)->str:""" Gets the value of :py:attr:`family` or its default value. """returnself.getOrDefault(self.family)@since("2.3.0")defgetLowerBoundsOnCoefficients(self)->Matrix:""" Gets the value of :py:attr:`lowerBoundsOnCoefficients` """returnself.getOrDefault(self.lowerBoundsOnCoefficients)@since("2.3.0")defgetUpperBoundsOnCoefficients(self)->Matrix:""" Gets the value of :py:attr:`upperBoundsOnCoefficients` """returnself.getOrDefault(self.upperBoundsOnCoefficients)@since("2.3.0")defgetLowerBoundsOnIntercepts(self)->Vector:""" Gets the value of :py:attr:`lowerBoundsOnIntercepts` """returnself.getOrDefault(self.lowerBoundsOnIntercepts)@since("2.3.0")defgetUpperBoundsOnIntercepts(self)->Vector:""" Gets the value of :py:attr:`upperBoundsOnIntercepts` """returnself.getOrDefault(self.upperBoundsOnIntercepts)
[docs]@since("2.1.0")defsetFamily(self,value:str)->"LogisticRegression":""" Sets the value of :py:attr:`family`. """returnself._set(family=value)
[docs]@since("2.3.0")defsetLowerBoundsOnCoefficients(self,value:Matrix)->"LogisticRegression":""" Sets the value of :py:attr:`lowerBoundsOnCoefficients` """returnself._set(lowerBoundsOnCoefficients=value)
[docs]@since("2.3.0")defsetUpperBoundsOnCoefficients(self,value:Matrix)->"LogisticRegression":""" Sets the value of :py:attr:`upperBoundsOnCoefficients` """returnself._set(upperBoundsOnCoefficients=value)
[docs]@since("2.3.0")defsetLowerBoundsOnIntercepts(self,value:Vector)->"LogisticRegression":""" Sets the value of :py:attr:`lowerBoundsOnIntercepts` """returnself._set(lowerBoundsOnIntercepts=value)
[docs]@since("2.3.0")defsetUpperBoundsOnIntercepts(self,value:Vector)->"LogisticRegression":""" Sets the value of :py:attr:`upperBoundsOnIntercepts` """returnself._set(upperBoundsOnIntercepts=value)
[docs]defsetMaxIter(self,value:int)->"LogisticRegression":""" Sets the value of :py:attr:`maxIter`. """returnself._set(maxIter=value)
[docs]defsetRegParam(self,value:float)->"LogisticRegression":""" Sets the value of :py:attr:`regParam`. """returnself._set(regParam=value)
[docs]defsetTol(self,value:float)->"LogisticRegression":""" Sets the value of :py:attr:`tol`. """returnself._set(tol=value)
[docs]defsetElasticNetParam(self,value:float)->"LogisticRegression":""" Sets the value of :py:attr:`elasticNetParam`. """returnself._set(elasticNetParam=value)
[docs]defsetFitIntercept(self,value:bool)->"LogisticRegression":""" Sets the value of :py:attr:`fitIntercept`. """returnself._set(fitIntercept=value)
[docs]defsetStandardization(self,value:bool)->"LogisticRegression":""" Sets the value of :py:attr:`standardization`. """returnself._set(standardization=value)
[docs]defsetWeightCol(self,value:str)->"LogisticRegression":""" Sets the value of :py:attr:`weightCol`. """returnself._set(weightCol=value)
[docs]defsetAggregationDepth(self,value:int)->"LogisticRegression":""" Sets the value of :py:attr:`aggregationDepth`. """returnself._set(aggregationDepth=value)
[docs]@since("3.1.0")defsetMaxBlockSizeInMB(self,value:float)->"LogisticRegression":""" Sets the value of :py:attr:`maxBlockSizeInMB`. """returnself._set(maxBlockSizeInMB=value)
[docs]classLogisticRegressionModel(_JavaProbabilisticClassificationModel[Vector],_LogisticRegressionParams,JavaMLWritable,JavaMLReadable["LogisticRegressionModel"],HasTrainingSummary["LogisticRegressionTrainingSummary"],):""" Model fitted by LogisticRegression. .. versionadded:: 1.3.0 """@property@since("2.0.0")defcoefficients(self)->Vector:""" Model coefficients of binomial logistic regression. An exception is thrown in the case of multinomial logistic regression. """returnself._call_java("coefficients")@property@since("1.4.0")defintercept(self)->float:""" Model intercept of binomial logistic regression. An exception is thrown in the case of multinomial logistic regression. """returnself._call_java("intercept")@property@since("2.1.0")defcoefficientMatrix(self)->Matrix:""" Model coefficients. """returnself._call_java("coefficientMatrix")@property@since("2.1.0")definterceptVector(self)->Vector:""" Model intercept. """returnself._call_java("interceptVector")@property@since("2.0.0")defsummary(self)->"LogisticRegressionTrainingSummary":""" Gets summary (accuracy/precision/recall, objective history, total iterations) of model trained on the training set. An exception is thrown if `trainingSummary is None`. """ifself.hasSummary:ifself.numClasses<=2:returnBinaryLogisticRegressionTrainingSummary(super(LogisticRegressionModel,self).summary)else:returnLogisticRegressionTrainingSummary(super(LogisticRegressionModel,self).summary)else:raiseRuntimeError("No training summary available for this %s"%self.__class__.__name__)
    def evaluate(self, dataset: DataFrame) -> "LogisticRegressionSummary":
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_blr_summary = self._call_java("evaluate", dataset)
        if self.numClasses <= 2:
            return BinaryLogisticRegressionSummary(java_blr_summary)
        else:
            return LogisticRegressionSummary(java_blr_summary)
class LogisticRegressionSummary(_ClassificationSummary):
    """
    Abstraction for Logistic Regression Results for a given model.

    .. versionadded:: 2.0.0
    """

    @property
    @since("2.0.0")
    def probabilityCol(self) -> str:
        """
        Field in "predictions" which gives the probability
        of each class as a vector.
        """
        return self._call_java("probabilityCol")

    @property
    @since("2.0.0")
    def featuresCol(self) -> str:
        """
        Field in "predictions" which gives the features of each instance
        as a vector.
        """
        return self._call_java("featuresCol")


@inherit_doc
class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary):
    """
    Abstraction for multinomial Logistic Regression Training results.

    .. versionadded:: 2.0.0
    """

    pass


@inherit_doc
class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, LogisticRegressionSummary):
    """
    Binary Logistic regression results for a given model.

    .. versionadded:: 2.0.0
    """

    pass


@inherit_doc
class BinaryLogisticRegressionTrainingSummary(
    BinaryLogisticRegressionSummary, LogisticRegressionTrainingSummary
):
    """
    Binary Logistic regression training results for a given model.

    .. versionadded:: 2.0.0
    """

    pass
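# Illustrative sketch (not part of the original module): the logistic regression
# summary classes above in use. Assumes a binary-label DataFrame `df` with
# "label" and "features" columns; the helper name is hypothetical.
def _example_logistic_regression_summaries(df):
    from pyspark.ml.classification import LogisticRegression

    model = LogisticRegression(maxIter=10, regParam=0.01).fit(df)
    train_summary = model.summary        # BinaryLogisticRegressionTrainingSummary for 2 classes
    print(train_summary.totalIterations)
    print(train_summary.areaUnderROC)    # from _BinaryClassificationSummary
    train_summary.roc.show(5)            # DataFrame of (FPR, TPR) points

    test_summary = model.evaluate(df)    # summary computed on an arbitrary dataset
    print(test_summary.weightedFMeasure())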
@inherit_docclass_DecisionTreeClassifierParams(_DecisionTreeParams,_TreeClassifierParams):""" Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`. """def__init__(self,*args:Any):super(_DecisionTreeClassifierParams,self).__init__(*args)self._setDefault(maxDepth=5,maxBins=32,minInstancesPerNode=1,minInfoGain=0.0,maxMemoryInMB=256,cacheNodeIds=False,checkpointInterval=10,impurity="gini",leafCol="",minWeightFractionPerNode=0.0,)
    def setMaxDepth(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        return self._set(maxDepth=value)

    def setMaxBins(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`maxBins`.
        """
        return self._set(maxBins=value)

    def setMinInstancesPerNode(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        return self._set(minInstancesPerNode=value)

    @since("3.0.0")
    def setMinWeightFractionPerNode(self, value: float) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`minWeightFractionPerNode`.
        """
        return self._set(minWeightFractionPerNode=value)

    def setMinInfoGain(self, value: float) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        return self._set(minInfoGain=value)

    def setMaxMemoryInMB(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        return self._set(maxMemoryInMB=value)

    def setCacheNodeIds(self, value: bool) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        return self._set(cacheNodeIds=value)

    @since("1.4.0")
    def setImpurity(self, value: str) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`impurity`.
        """
        return self._set(impurity=value)

    @since("1.4.0")
    def setCheckpointInterval(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`checkpointInterval`.
        """
        return self._set(checkpointInterval=value)

    def setSeed(self, value: int) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`seed`.
        """
        return self._set(seed=value)

    @since("3.0.0")
    def setWeightCol(self, value: str) -> "DecisionTreeClassifier":
        """
        Sets the value of :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)
@inherit_doc
class DecisionTreeClassificationModel(
    _DecisionTreeModel,
    _JavaProbabilisticClassificationModel[Vector],
    _DecisionTreeClassifierParams,
    JavaMLWritable,
    JavaMLReadable["DecisionTreeClassificationModel"],
):
    """
    Model fitted by DecisionTreeClassifier.

    .. versionadded:: 1.4.0
    """

    @property
    def featureImportances(self) -> Vector:
        """
        Estimate of the importance of each feature.

        This generalizes the idea of "Gini" importance to other losses,
        following the explanation of Gini importance from "Random Forests"
        documentation by Leo Breiman and Adele Cutler, and following the
        implementation from scikit-learn.

        This feature importance is calculated as follows:
          - importance(feature j) = sum (over nodes which split on feature j)
            of the gain, where gain is scaled by the number of instances
            passing through node
          - Normalize importances for tree to sum to 1.

        .. versionadded:: 2.0.0

        Notes
        -----
        Feature importance for single decision trees can have high variance due to
        correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
        to determine feature importance instead.
        """
        return self._call_java("featureImportances")
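# Illustrative sketch (not part of the original module): inspecting a fitted tree,
# including the featureImportances vector documented above. Assumes a DataFrame
# `df` with "label" and "features" columns; the helper name is hypothetical.
def _example_decision_tree_inspection(df):
    from pyspark.ml.classification import DecisionTreeClassifier

    model = DecisionTreeClassifier(maxDepth=3, impurity="gini").fit(df)
    print(model.featureImportances)   # per-feature gains, normalized to sum to 1
    print(model.depth, model.numNodes)
    print(model.toDebugString)        # human-readable description of the splits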
@inherit_docclass_RandomForestClassifierParams(_RandomForestParams,_TreeClassifierParams):""" Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`. """def__init__(self,*args:Any):super(_RandomForestClassifierParams,self).__init__(*args)self._setDefault(maxDepth=5,maxBins=32,minInstancesPerNode=1,minInfoGain=0.0,maxMemoryInMB=256,cacheNodeIds=False,checkpointInterval=10,impurity="gini",numTrees=20,featureSubsetStrategy="auto",subsamplingRate=1.0,leafCol="",minWeightFractionPerNode=0.0,bootstrap=True,)
    def setMaxDepth(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        return self._set(maxDepth=value)

    def setMaxBins(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`maxBins`.
        """
        return self._set(maxBins=value)

    def setMinInstancesPerNode(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        return self._set(minInstancesPerNode=value)

    def setMinInfoGain(self, value: float) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        return self._set(minInfoGain=value)

    def setMaxMemoryInMB(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        return self._set(maxMemoryInMB=value)

    def setCacheNodeIds(self, value: bool) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        return self._set(cacheNodeIds=value)

    @since("1.4.0")
    def setImpurity(self, value: str) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`impurity`.
        """
        return self._set(impurity=value)

    @since("1.4.0")
    def setNumTrees(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`numTrees`.
        """
        return self._set(numTrees=value)

    @since("3.0.0")
    def setBootstrap(self, value: bool) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`bootstrap`.
        """
        return self._set(bootstrap=value)

    @since("1.4.0")
    def setSubsamplingRate(self, value: float) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`subsamplingRate`.
        """
        return self._set(subsamplingRate=value)

    @since("2.4.0")
    def setFeatureSubsetStrategy(self, value: str) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`featureSubsetStrategy`.
        """
        return self._set(featureSubsetStrategy=value)

    def setSeed(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`seed`.
        """
        return self._set(seed=value)

    def setCheckpointInterval(self, value: int) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`checkpointInterval`.
        """
        return self._set(checkpointInterval=value)

    @since("3.0.0")
    def setWeightCol(self, value: str) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`weightCol`.
        """
        return self._set(weightCol=value)

    @since("3.0.0")
    def setMinWeightFractionPerNode(self, value: float) -> "RandomForestClassifier":
        """
        Sets the value of :py:attr:`minWeightFractionPerNode`.
        """
        return self._set(minWeightFractionPerNode=value)
[docs]classRandomForestClassificationModel(_TreeEnsembleModel,_JavaProbabilisticClassificationModel[Vector],_RandomForestClassifierParams,JavaMLWritable,JavaMLReadable["RandomForestClassificationModel"],HasTrainingSummary["RandomForestClassificationTrainingSummary"],):""" Model fitted by RandomForestClassifier. .. versionadded:: 1.4.0 """@propertydeffeatureImportances(self)->Vector:""" Estimate of the importance of each feature. Each feature's importance is the average of its importance across all trees in the ensemble The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) and follows the implementation from scikit-learn. .. versionadded:: 2.0.0 See Also -------- DecisionTreeClassificationModel.featureImportances """returnself._call_java("featureImportances")@property@since("2.0.0")deftrees(self)->List[DecisionTreeClassificationModel]:"""Trees in this ensemble. Warning: These have null parent Estimators."""return[DecisionTreeClassificationModel(m)forminlist(self._call_java("trees"))]@property@since("3.1.0")defsummary(self)->"RandomForestClassificationTrainingSummary":""" Gets summary (accuracy/precision/recall, objective history, total iterations) of model trained on the training set. An exception is thrown if `trainingSummary is None`. """ifself.hasSummary:ifself.numClasses<=2:returnBinaryRandomForestClassificationTrainingSummary(super(RandomForestClassificationModel,self).summary)else:returnRandomForestClassificationTrainingSummary(super(RandomForestClassificationModel,self).summary)else:raiseRuntimeError("No training summary available for this %s"%self.__class__.__name__)
    def evaluate(
        self, dataset: DataFrame
    ) -> Union[
        "BinaryRandomForestClassificationSummary", "RandomForestClassificationSummary"
    ]:
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_rf_summary = self._call_java("evaluate", dataset)
        if self.numClasses <= 2:
            return BinaryRandomForestClassificationSummary(java_rf_summary)
        else:
            return RandomForestClassificationSummary(java_rf_summary)
class RandomForestClassificationSummary(_ClassificationSummary):
    """
    Abstraction for RandomForestClassification Results for a given model.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class RandomForestClassificationTrainingSummary(
    RandomForestClassificationSummary, _TrainingSummary
):
    """
    Abstraction for RandomForestClassification Training results.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class BinaryRandomForestClassificationSummary(_BinaryClassificationSummary):
    """
    BinaryRandomForestClassification results for a given model.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class BinaryRandomForestClassificationTrainingSummary(
    BinaryRandomForestClassificationSummary, RandomForestClassificationTrainingSummary
):
    """
    BinaryRandomForestClassification training results for a given model.

    .. versionadded:: 3.1.0
    """

    pass
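# Illustrative sketch (not part of the original module): the binary vs. multiclass
# summary split mirrors RandomForestClassificationModel.summary above. Assumes a
# binary-label DataFrame `df` with "label"/"features" columns; helper name is hypothetical.
def _example_random_forest_summary(df):
    from pyspark.ml.classification import RandomForestClassifier

    model = RandomForestClassifier(numTrees=10, seed=42).fit(df)
    summary = model.summary                  # Binary... training summary when numClasses <= 2
    print(summary.accuracy, summary.weightedRecall)
    print(len(model.trees), model.treeWeights)
    print(model.featureImportances)          # importances averaged over the ensemble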
class_GBTClassifierParams(_GBTParams,_HasVarianceImpurity):""" Params for :py:class:`GBTClassifier` and :py:class:`GBTClassifierModel`. .. versionadded:: 3.0.0 """supportedLossTypes:List[str]=["logistic"]lossType:Param[str]=Param(Params._dummy(),"lossType","Loss function which GBT tries to minimize (case-insensitive). "+"Supported options: "+", ".join(supportedLossTypes),typeConverter=TypeConverters.toString,)def__init__(self,*args:Any):super(_GBTClassifierParams,self).__init__(*args)self._setDefault(maxDepth=5,maxBins=32,minInstancesPerNode=1,minInfoGain=0.0,maxMemoryInMB=256,cacheNodeIds=False,checkpointInterval=10,lossType="logistic",maxIter=20,stepSize=0.1,subsamplingRate=1.0,impurity="variance",featureSubsetStrategy="all",validationTol=0.01,leafCol="",minWeightFractionPerNode=0.0,)@since("1.4.0")defgetLossType(self)->str:""" Gets the value of lossType or its default value. """returnself.getOrDefault(self.lossType)
[docs]@inherit_docclassGBTClassifier(_JavaProbabilisticClassifier["GBTClassificationModel"],_GBTClassifierParams,JavaMLWritable,JavaMLReadable["GBTClassifier"],):""" `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_ learning algorithm for classification. It supports binary labels, as well as both continuous and categorical features. .. versionadded:: 1.4.0 Notes ----- Multiclass labels are not currently supported. The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999. Gradient Boosting vs. TreeBoost: - This implementation is for Stochastic Gradient Boosting, not for TreeBoost. - Both algorithms learn tree ensembles by minimizing loss functions. - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes based on the loss function, whereas the original gradient boosting method does not. - We expect to implement TreeBoost in the future: `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_ Examples -------- >>> from numpy import allclose >>> from pyspark.ml.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer >>> df = spark.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") >>> si_model = stringIndexer.fit(df) >>> td = si_model.transform(df) >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42, ... leafCol="leafId") >>> gbt.setMaxIter(5) GBTClassifier... >>> gbt.setMinWeightFractionPerNode(0.049) GBTClassifier... >>> gbt.getMaxIter() 5 >>> gbt.getFeatureSubsetStrategy() 'all' >>> model = gbt.fit(td) >>> model.getLabelCol() 'indexed' >>> model.setFeaturesCol("features") GBTClassificationModel... >>> model.setThresholds([0.3, 0.7]) GBTClassificationModel... >>> model.getThresholds() [0.3, 0.7] >>> model.featureImportances SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.predict(test0.head().features) 0.0 >>> model.predictRaw(test0.head().features) DenseVector([1.1697, -1.1697]) >>> model.predictProbability(test0.head().features) DenseVector([0.9121, 0.0879]) >>> result = model.transform(test0).head() >>> result.prediction 0.0 >>> result.leafId DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]) >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 >>> model.totalNumNodes 15 >>> print(model.toDebugString) GBTClassificationModel...numTrees=5... >>> gbtc_path = temp_path + "gbtc" >>> gbt.save(gbtc_path) >>> gbt2 = GBTClassifier.load(gbtc_path) >>> gbt2.getMaxDepth() 2 >>> model_path = temp_path + "gbtc_model" >>> model.save(model_path) >>> model2 = GBTClassificationModel.load(model_path) >>> model.featureImportances == model2.featureImportances True >>> model.treeWeights == model2.treeWeights True >>> model.transform(test0).take(1) == model2.transform(test0).take(1) True >>> model.trees [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...] >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)], ... ["indexed", "features"]) >>> model.evaluateEachIteration(validation) [0.25..., 0.23..., 0.21..., 0.19..., 0.18...] 
>>> model.numClasses 2 >>> gbt = gbt.setValidationIndicatorCol("validationIndicator") >>> gbt.getValidationIndicatorCol() 'validationIndicator' >>> gbt.getValidationTol() 0.01 """_input_kwargs:Dict[str,Any]@keyword_onlydef__init__(self,*,featuresCol:str="features",labelCol:str="label",predictionCol:str="prediction",maxDepth:int=5,maxBins:int=32,minInstancesPerNode:int=1,minInfoGain:float=0.0,maxMemoryInMB:int=256,cacheNodeIds:bool=False,checkpointInterval:int=10,lossType:str="logistic",maxIter:int=20,stepSize:float=0.1,seed:Optional[int]=None,subsamplingRate:float=1.0,impurity:str="variance",featureSubsetStrategy:str="all",validationTol:float=0.01,validationIndicatorCol:Optional[str]=None,leafCol:str="",minWeightFractionPerNode:float=0.0,weightCol:Optional[str]=None,):""" __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, \ weightCol=None) """super(GBTClassifier,self).__init__()self._java_obj=self._new_java_obj("org.apache.spark.ml.classification.GBTClassifier",self.uid)kwargs=self._input_kwargsself.setParams(**kwargs)
    def setMaxDepth(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`maxDepth`.
        """
        return self._set(maxDepth=value)

    def setMaxBins(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`maxBins`.
        """
        return self._set(maxBins=value)

    def setMinInstancesPerNode(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`minInstancesPerNode`.
        """
        return self._set(minInstancesPerNode=value)

    def setMinInfoGain(self, value: float) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`minInfoGain`.
        """
        return self._set(minInfoGain=value)

    def setMaxMemoryInMB(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`maxMemoryInMB`.
        """
        return self._set(maxMemoryInMB=value)

    def setCacheNodeIds(self, value: bool) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`cacheNodeIds`.
        """
        return self._set(cacheNodeIds=value)

    @since("1.4.0")
    def setImpurity(self, value: str) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`impurity`.
        """
        return self._set(impurity=value)

    @since("1.4.0")
    def setLossType(self, value: str) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`lossType`.
        """
        return self._set(lossType=value)

    @since("1.4.0")
    def setSubsamplingRate(self, value: float) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`subsamplingRate`.
        """
        return self._set(subsamplingRate=value)

    @since("2.4.0")
    def setFeatureSubsetStrategy(self, value: str) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`featureSubsetStrategy`.
        """
        return self._set(featureSubsetStrategy=value)

    @since("3.0.0")
    def setValidationIndicatorCol(self, value: str) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`validationIndicatorCol`.
        """
        return self._set(validationIndicatorCol=value)

    @since("1.4.0")
    def setMaxIter(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`maxIter`.
        """
        return self._set(maxIter=value)

    @since("1.4.0")
    def setCheckpointInterval(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`checkpointInterval`.
        """
        return self._set(checkpointInterval=value)

    @since("1.4.0")
    def setSeed(self, value: int) -> "GBTClassifier":
        """
        Sets the value of :py:attr:`seed`.
        """
        return self._set(seed=value)
[docs]@since("1.4.0")defsetStepSize(self,value:int)->"GBTClassifier":""" Sets the value of :py:attr:`stepSize`. """returnself._set(stepSize=value)
[docs]@since("3.0.0")defsetWeightCol(self,value:str)->"GBTClassifier":""" Sets the value of :py:attr:`weightCol`. """returnself._set(weightCol=value)
[docs]@since("3.0.0")defsetMinWeightFractionPerNode(self,value:float)->"GBTClassifier":""" Sets the value of :py:attr:`minWeightFractionPerNode`. """returnself._set(minWeightFractionPerNode=value)
[docs]classGBTClassificationModel(_TreeEnsembleModel,_JavaProbabilisticClassificationModel[Vector],_GBTClassifierParams,JavaMLWritable,JavaMLReadable["GBTClassificationModel"],):""" Model fitted by GBTClassifier. .. versionadded:: 1.4.0 """@propertydeffeatureImportances(self)->Vector:""" Estimate of the importance of each feature. Each feature's importance is the average of its importance across all trees in the ensemble The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) and follows the implementation from scikit-learn. .. versionadded:: 2.0.0 See Also -------- DecisionTreeClassificationModel.featureImportances """returnself._call_java("featureImportances")@property@since("2.0.0")deftrees(self)->List[DecisionTreeRegressionModel]:"""Trees in this ensemble. Warning: These have null parent Estimators."""return[DecisionTreeRegressionModel(m)forminlist(self._call_java("trees"))]
    def evaluateEachIteration(self, dataset: DataFrame) -> List[float]:
        """
        Method to compute error or loss for every iteration of gradient boosting.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        return self._call_java("evaluateEachIteration", dataset)
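# Illustrative sketch (not part of the original module): using evaluateEachIteration
# to pick the number of boosting iterations on held-out data. Assumes binary-label
# DataFrames `train` and `test` with "label"/"features" columns; helper name is hypothetical.
def _example_gbt_evaluate_each_iteration(train, test):
    from pyspark.ml.classification import GBTClassifier

    model = GBTClassifier(maxIter=20, maxDepth=3, seed=42).fit(train)
    losses = model.evaluateEachIteration(test)   # one loss value per boosting iteration
    best = min(range(len(losses)), key=losses.__getitem__)
    print(best + 1, losses[best])                # iteration count with the lowest held-out loss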
class_NaiveBayesParams(_PredictorParams,HasWeightCol):""" Params for :py:class:`NaiveBayes` and :py:class:`NaiveBayesModel`. .. versionadded:: 3.0.0 """smoothing:Param[float]=Param(Params._dummy(),"smoothing","The smoothing parameter, should be >= 0, "+"default is 1.0",typeConverter=TypeConverters.toFloat,)modelType:Param[str]=Param(Params._dummy(),"modelType","The model type which is a string "+"(case-sensitive). Supported options: multinomial (default), bernoulli "+"and gaussian.",typeConverter=TypeConverters.toString,)def__init__(self,*args:Any):super(_NaiveBayesParams,self).__init__(*args)self._setDefault(smoothing=1.0,modelType="multinomial")@since("1.5.0")defgetSmoothing(self)->float:""" Gets the value of smoothing or its default value. """returnself.getOrDefault(self.smoothing)@since("1.5.0")defgetModelType(self)->str:""" Gets the value of modelType or its default value. """returnself.getOrDefault(self.modelType)
[docs]@inherit_docclassNaiveBayes(_JavaProbabilisticClassifier["NaiveBayesModel"],_NaiveBayesParams,HasThresholds,HasWeightCol,JavaMLWritable,JavaMLReadable["NaiveBayes"],):""" Naive Bayes Classifiers. It supports both Multinomial and Bernoulli NB. `Multinomial NB \ <http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_ can handle finitely supported discrete data. For example, by converting documents into TF-IDF vectors, it can be used for document classification. By making every vector a binary (0/1) data, it can also be used as `Bernoulli NB \ <http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html>`_. The input feature values for Multinomial NB and Bernoulli NB must be nonnegative. Since 3.0.0, it supports Complement NB which is an adaptation of the Multinomial NB. Specifically, Complement NB uses statistics from the complement of each class to compute the model's coefficients. The inventors of Complement NB show empirically that the parameter estimates for CNB are more stable than those for Multinomial NB. Like Multinomial NB, the input feature values for Complement NB must be nonnegative. Since 3.0.0, it also supports `Gaussian NB \ <https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Gaussian_naive_Bayes>`_. which can handle continuous data. .. versionadded:: 1.5.0 Examples -------- >>> from pyspark.sql import Row >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([ ... Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])), ... Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])), ... Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))]) >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight") >>> model = nb.fit(df) >>> model.setFeaturesCol("features") NaiveBayesModel... >>> model.getSmoothing() 1.0 >>> model.pi DenseVector([-0.81..., -0.58...]) >>> model.theta DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1) >>> model.sigma DenseMatrix(0, 0, [...], ...) >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() >>> model.predict(test0.head().features) 1.0 >>> model.predictRaw(test0.head().features) DenseVector([-1.72..., -0.99...]) >>> model.predictProbability(test0.head().features) DenseVector([0.32..., 0.67...]) >>> result = model.transform(test0).head() >>> result.prediction 1.0 >>> result.probability DenseVector([0.32..., 0.67...]) >>> result.rawPrediction DenseVector([-1.72..., -0.99...]) >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF() >>> model.transform(test1).head().prediction 1.0 >>> nb_path = temp_path + "/nb" >>> nb.save(nb_path) >>> nb2 = NaiveBayes.load(nb_path) >>> nb2.getSmoothing() 1.0 >>> model_path = temp_path + "/nb_model" >>> model.save(model_path) >>> model2 = NaiveBayesModel.load(model_path) >>> model.pi == model2.pi True >>> model.theta == model2.theta True >>> model.transform(test0).take(1) == model2.transform(test0).take(1) True >>> nb = nb.setThresholds([0.01, 10.00]) >>> model3 = nb.fit(df) >>> result = model3.transform(test0).head() >>> result.prediction 0.0 >>> nb3 = NaiveBayes().setModelType("gaussian") >>> model4 = nb3.fit(df) >>> model4.getModelType() 'gaussian' >>> model4.sigma DenseMatrix(2, 2, [0.0, 0.25, 0.0, 0.0], 1) >>> nb5 = NaiveBayes(smoothing=1.0, modelType="complement", weightCol="weight") >>> model5 = nb5.fit(df) >>> model5.getModelType() 'complement' >>> model5.theta DenseMatrix(2, 2, [...], 1) >>> model5.sigma DenseMatrix(0, 0, [...], ...) 
"""_input_kwargs:Dict[str,Any]@keyword_onlydef__init__(self,*,featuresCol:str="features",labelCol:str="label",predictionCol:str="prediction",probabilityCol:str="probability",rawPredictionCol:str="rawPrediction",smoothing:float=1.0,modelType:str="multinomial",thresholds:Optional[List[float]]=None,weightCol:Optional[str]=None,):""" __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \ modelType="multinomial", thresholds=None, weightCol=None) """super(NaiveBayes,self).__init__()self._java_obj=self._new_java_obj("org.apache.spark.ml.classification.NaiveBayes",self.uid)kwargs=self._input_kwargsself.setParams(**kwargs)
[docs]@since("1.5.0")defsetSmoothing(self,value:float)->"NaiveBayes":""" Sets the value of :py:attr:`smoothing`. """returnself._set(smoothing=value)
[docs]@since("1.5.0")defsetModelType(self,value:str)->"NaiveBayes":""" Sets the value of :py:attr:`modelType`. """returnself._set(modelType=value)
[docs]defsetWeightCol(self,value:str)->"NaiveBayes":""" Sets the value of :py:attr:`weightCol`. """returnself._set(weightCol=value)
class NaiveBayesModel(
    _JavaProbabilisticClassificationModel[Vector],
    _NaiveBayesParams,
    JavaMLWritable,
    JavaMLReadable["NaiveBayesModel"],
):
    """
    Model fitted by NaiveBayes.

    .. versionadded:: 1.5.0
    """

    @property
    @since("2.0.0")
    def pi(self) -> Vector:
        """
        log of class priors.
        """
        return self._call_java("pi")

    @property
    @since("2.0.0")
    def theta(self) -> Matrix:
        """
        log of class conditional probabilities.
        """
        return self._call_java("theta")

    @property
    @since("3.0.0")
    def sigma(self) -> Matrix:
        """
        variance of each feature.
        """
        return self._call_java("sigma")
class_MultilayerPerceptronParams(_ProbabilisticClassifierParams,HasSeed,HasMaxIter,HasTol,HasStepSize,HasSolver,HasBlockSize,):""" Params for :py:class:`MultilayerPerceptronClassifier`. .. versionadded:: 3.0.0 """layers:Param[List[int]]=Param(Params._dummy(),"layers","Sizes of layers from input layer to output layer "+"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 "+"neurons and output layer of 10 neurons.",typeConverter=TypeConverters.toListInt,)solver:Param[str]=Param(Params._dummy(),"solver","The solver algorithm for optimization. Supported "+"options: l-bfgs, gd.",typeConverter=TypeConverters.toString,)initialWeights:Param[Vector]=Param(Params._dummy(),"initialWeights","The initial weights of the model.",typeConverter=TypeConverters.toVector,)def__init__(self,*args:Any):super(_MultilayerPerceptronParams,self).__init__(*args)self._setDefault(maxIter=100,tol=1e-6,blockSize=128,stepSize=0.03,solver="l-bfgs")@since("1.6.0")defgetLayers(self)->List[int]:""" Gets the value of layers or its default value. """returnself.getOrDefault(self.layers)@since("2.0.0")defgetInitialWeights(self)->Vector:""" Gets the value of initialWeights or its default value. """returnself.getOrDefault(self.initialWeights)
[docs]@inherit_docclassMultilayerPerceptronClassifier(_JavaProbabilisticClassifier["MultilayerPerceptronClassificationModel"],_MultilayerPerceptronParams,JavaMLWritable,JavaMLReadable["MultilayerPerceptronClassifier"],):""" Classifier trainer based on the Multilayer Perceptron. Each layer has sigmoid activation function, output layer has softmax. Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. .. versionadded:: 1.6.0 Examples -------- >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([ ... (0.0, Vectors.dense([0.0, 0.0])), ... (1.0, Vectors.dense([0.0, 1.0])), ... (1.0, Vectors.dense([1.0, 0.0])), ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) >>> mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123) >>> mlp.setMaxIter(100) MultilayerPerceptronClassifier... >>> mlp.getMaxIter() 100 >>> mlp.getBlockSize() 128 >>> mlp.setBlockSize(1) MultilayerPerceptronClassifier... >>> mlp.getBlockSize() 1 >>> model = mlp.fit(df) >>> model.setFeaturesCol("features") MultilayerPerceptronClassificationModel... >>> model.getMaxIter() 100 >>> model.getLayers() [2, 2, 2] >>> model.weights.size 12 >>> testDF = spark.createDataFrame([ ... (Vectors.dense([1.0, 0.0]),), ... (Vectors.dense([0.0, 0.0]),)], ["features"]) >>> model.predict(testDF.head().features) 1.0 >>> model.predictRaw(testDF.head().features) DenseVector([-16.208, 16.344]) >>> model.predictProbability(testDF.head().features) DenseVector([0.0, 1.0]) >>> model.transform(testDF).select("features", "prediction").show() +---------+----------+ | features|prediction| +---------+----------+ |[1.0,0.0]| 1.0| |[0.0,0.0]| 0.0| +---------+----------+ ... >>> mlp_path = temp_path + "/mlp" >>> mlp.save(mlp_path) >>> mlp2 = MultilayerPerceptronClassifier.load(mlp_path) >>> mlp2.getBlockSize() 1 >>> model_path = temp_path + "/mlp_model" >>> model.save(model_path) >>> model2 = MultilayerPerceptronClassificationModel.load(model_path) >>> model.getLayers() == model2.getLayers() True >>> model.weights == model2.weights True >>> model.transform(testDF).take(1) == model2.transform(testDF).take(1) True >>> mlp2 = mlp2.setInitialWeights(list(range(0, 12))) >>> model3 = mlp2.fit(df) >>> model3.weights != model2.weights True >>> model3.getLayers() == model.getLayers() True """_input_kwargs:Dict[str,Any]@keyword_onlydef__init__(self,*,featuresCol:str="features",labelCol:str="label",predictionCol:str="prediction",maxIter:int=100,tol:float=1e-6,seed:Optional[int]=None,layers:Optional[List[int]]=None,blockSize:int=128,stepSize:float=0.03,solver:str="l-bfgs",initialWeights:Optional[Vector]=None,probabilityCol:str="probability",rawPredictionCol:str="rawPrediction",):""" __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction") """super(MultilayerPerceptronClassifier,self).__init__()self._java_obj=self._new_java_obj("org.apache.spark.ml.classification.MultilayerPerceptronClassifier",self.uid)kwargs=self._input_kwargsself.setParams(**kwargs)
[docs]@since("1.6.0")defsetLayers(self,value:List[int])->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`layers`. """returnself._set(layers=value)
[docs]@since("1.6.0")defsetBlockSize(self,value:int)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`blockSize`. """returnself._set(blockSize=value)
[docs]@since("2.0.0")defsetInitialWeights(self,value:Vector)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`initialWeights`. """returnself._set(initialWeights=value)
[docs]defsetMaxIter(self,value:int)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`maxIter`. """returnself._set(maxIter=value)
[docs]defsetSeed(self,value:int)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`seed`. """returnself._set(seed=value)
[docs]defsetTol(self,value:float)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`tol`. """returnself._set(tol=value)
[docs]@since("2.0.0")defsetStepSize(self,value:float)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`stepSize`. """returnself._set(stepSize=value)
[docs]defsetSolver(self,value:str)->"MultilayerPerceptronClassifier":""" Sets the value of :py:attr:`solver`. """returnself._set(solver=value)
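# Illustrative sketch (an assumption, not part of this module): configuring the two supported
# solvers via the chained setters above.  The sketch assumes the default "l-bfgs" solver does
# not use stepSize, while "gd" (minibatch gradient descent) does; both honor maxIter and tol.
def _example_mlp_solver_config() -> "MultilayerPerceptronClassifier":
    """Hypothetical helper returning a gd-configured trainer; parameter values are arbitrary."""
    return (
        MultilayerPerceptronClassifier(layers=[4, 5, 3], seed=11)
        .setSolver("gd")
        .setStepSize(0.05)
        .setMaxIter(200)
        .setTol(1e-5)
    )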
class MultilayerPerceptronClassificationModel(
    _JavaProbabilisticClassificationModel[Vector],
    _MultilayerPerceptronParams,
    JavaMLWritable,
    JavaMLReadable["MultilayerPerceptronClassificationModel"],
    HasTrainingSummary["MultilayerPerceptronClassificationTrainingSummary"],
):
    """
    Model fitted by MultilayerPerceptronClassifier.

    .. versionadded:: 1.6.0
    """

    @property
    @since("2.0.0")
    def weights(self) -> Vector:
        """
        the weights of layers.
        """
        return self._call_java("weights")
[docs]@since("3.1.0")defsummary(self)->"MultilayerPerceptronClassificationTrainingSummary":""" Gets summary (accuracy/precision/recall, objective history, total iterations) of model trained on the training set. An exception is thrown if `trainingSummary is None`. """ifself.hasSummary:returnMultilayerPerceptronClassificationTrainingSummary(super(MultilayerPerceptronClassificationModel,self).summary)else:raiseRuntimeError("No training summary available for this %s"%self.__class__.__name__)
    def evaluate(self, dataset: DataFrame) -> "MultilayerPerceptronClassificationSummary":
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_mlp_summary = self._call_java("evaluate", dataset)
        return MultilayerPerceptronClassificationSummary(java_mlp_summary)
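# Illustrative sketch (an assumption, not part of this module): evaluating a fitted model on a
# held-out DataFrame and reading a metric from the returned summary.  ``test_df`` is a
# hypothetical DataFrame with the same ``features``/``label`` columns used for training.
def _example_mlp_evaluate(
    model: "MultilayerPerceptronClassificationModel", test_df: DataFrame
) -> float:
    """Hypothetical helper: accuracy of ``model`` on ``test_df``."""
    return model.evaluate(test_df).accuracy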
class MultilayerPerceptronClassificationSummary(_ClassificationSummary):
    """
    Abstraction for MultilayerPerceptronClassifier Results for a given model.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class MultilayerPerceptronClassificationTrainingSummary(
    MultilayerPerceptronClassificationSummary, _TrainingSummary
):
    """
    Abstraction for MultilayerPerceptronClassifier Training results.

    .. versionadded:: 3.1.0
    """

    pass
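# Illustrative sketch (an assumption, not part of this module): reading training diagnostics
# from the summary attached to a freshly fitted model.  ``objectiveHistory`` and
# ``totalIterations`` come from the shared training-summary mixin; ``accuracy`` and
# ``weightedRecall`` from the classification summary.
def _example_mlp_training_summary(model: "MultilayerPerceptronClassificationModel") -> None:
    """Hypothetical helper: print a few training-summary fields."""
    if model.hasSummary:
        s = model.summary
        print(s.accuracy, s.weightedRecall, s.totalIterations, len(s.objectiveHistory))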
class _OneVsRestParams(_ClassifierParams, HasWeightCol):
    """
    Params for :py:class:`OneVsRest` and :py:class:`OneVsRestModel`.
    """

    classifier: Param[Classifier] = Param(Params._dummy(), "classifier", "base binary classifier")

    @since("2.0.0")
    def getClassifier(self) -> Classifier:
        """
        Gets the value of classifier or its default value.
        """
        return self.getOrDefault(self.classifier)
@inherit_doc
class OneVsRest(
    Estimator["OneVsRestModel"],
    _OneVsRestParams,
    HasParallelism,
    MLReadable["OneVsRest"],
    MLWritable,
    Generic[CM],
):
    """
    Reduction of Multiclass Classification to Binary Classification.
    Performs reduction using one against all strategy.
    For a multiclass classification with k classes, train k models (one per class).
    Each example is scored against all k models and the model with highest score
    is picked to label the example.

    .. versionadded:: 2.0.0

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> from pyspark.ml.linalg import Vectors
    >>> data_path = "data/mllib/sample_multiclass_classification_data.txt"
    >>> df = spark.read.format("libsvm").load(data_path)
    >>> lr = LogisticRegression(regParam=0.01)
    >>> ovr = OneVsRest(classifier=lr)
    >>> ovr.getRawPredictionCol()
    'rawPrediction'
    >>> ovr.setPredictionCol("newPrediction")
    OneVsRest...
    >>> model = ovr.fit(df)
    >>> model.models[0].coefficients
    DenseVector([0.5..., -1.0..., 3.4..., 4.2...])
    >>> model.models[1].coefficients
    DenseVector([-2.1..., 3.1..., -2.6..., -2.3...])
    >>> model.models[2].coefficients
    DenseVector([0.3..., -3.4..., 1.0..., -1.1...])
    >>> [x.intercept for x in model.models]
    [-2.7..., -2.5..., -1.3...]
    >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0, 1.0, 1.0))]).toDF()
    >>> model.transform(test0).head().newPrediction
    0.0
    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(4, [0], [1.0]))]).toDF()
    >>> model.transform(test1).head().newPrediction
    2.0
    >>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4, 0.3, 0.2))]).toDF()
    >>> model.transform(test2).head().newPrediction
    0.0
    >>> model_path = temp_path + "/ovr_model"
    >>> model.save(model_path)
    >>> model2 = OneVsRestModel.load(model_path)
    >>> model2.transform(test0).head().newPrediction
    0.0
    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
    True
    >>> model.transform(test2).columns
    ['features', 'rawPrediction', 'newPrediction']
    """

    _input_kwargs: Dict[str, Any]

    @keyword_only
    def __init__(
        self,
        *,
        featuresCol: str = "features",
        labelCol: str = "label",
        predictionCol: str = "prediction",
        rawPredictionCol: str = "rawPrediction",
        classifier: Optional[Classifier[CM]] = None,
        weightCol: Optional[str] = None,
        parallelism: int = 1,
    ):
        """
        __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                 rawPredictionCol="rawPrediction", classifier=None, weightCol=None, parallelism=1):
        """
        super(OneVsRest, self).__init__()
        self._setDefault(parallelism=1)
        kwargs = self._input_kwargs
        self._set(**kwargs)
[docs]@since("2.0.0")defsetClassifier(self,value:Classifier[CM])->"OneVsRest":""" Sets the value of :py:attr:`classifier`. """returnself._set(classifier=value)
[docs]defsetLabelCol(self,value:str)->"OneVsRest":""" Sets the value of :py:attr:`labelCol`. """returnself._set(labelCol=value)
[docs]defsetFeaturesCol(self,value:str)->"OneVsRest":""" Sets the value of :py:attr:`featuresCol`. """returnself._set(featuresCol=value)
[docs]defsetPredictionCol(self,value:str)->"OneVsRest":""" Sets the value of :py:attr:`predictionCol`. """returnself._set(predictionCol=value)
[docs]defsetRawPredictionCol(self,value:str)->"OneVsRest":""" Sets the value of :py:attr:`rawPredictionCol`. """returnself._set(rawPredictionCol=value)
[docs]defsetWeightCol(self,value:str)->"OneVsRest":""" Sets the value of :py:attr:`weightCol`. """returnself._set(weightCol=value)
[docs]defsetParallelism(self,value:int)->"OneVsRest":""" Sets the value of :py:attr:`parallelism`. """returnself._set(parallelism=value)
    def _fit(self, dataset: DataFrame) -> "OneVsRestModel":
        labelCol = self.getLabelCol()
        featuresCol = self.getFeaturesCol()
        predictionCol = self.getPredictionCol()
        classifier = self.getClassifier()

        numClasses = (
            int(cast(Row, dataset.agg({labelCol: "max"}).head())["max(" + labelCol + ")"]) + 1
        )

        weightCol = None
        if self.isDefined(self.weightCol) and self.getWeightCol():
            if isinstance(classifier, HasWeightCol):
                weightCol = self.getWeightCol()
            else:
                warnings.warn(
                    "weightCol is ignored, "
                    "as it is not supported by {} now.".format(classifier)
                )

        if weightCol:
            multiclassLabeled = dataset.select(labelCol, featuresCol, weightCol)
        else:
            multiclassLabeled = dataset.select(labelCol, featuresCol)

        # persist if underlying dataset is not persistent.
        handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
        if handlePersistence:
            multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)

        def trainSingleClass(index: int) -> CM:
            binaryLabelCol = "mc2b$" + str(index)
            trainingDataset = multiclassLabeled.withColumn(
                binaryLabelCol,
                when(multiclassLabeled[labelCol] == float(index), 1.0).otherwise(0.0),
            )
            paramMap = dict(
                [
                    (classifier.labelCol, binaryLabelCol),
                    (classifier.featuresCol, featuresCol),
                    (classifier.predictionCol, predictionCol),
                ]
            )
            if weightCol:
                paramMap[cast(HasWeightCol, classifier).weightCol] = weightCol
            return classifier.fit(trainingDataset, paramMap)

        pool = ThreadPool(processes=min(self.getParallelism(), numClasses))

        models = pool.map(inheritable_thread_target(trainSingleClass), range(numClasses))

        if handlePersistence:
            multiclassLabeled.unpersist()

        return self._copyValues(OneVsRestModel(models=models))
    def copy(self, extra: Optional["ParamMap"] = None) -> "OneVsRest":
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of the embedded paramMap,
        and copies the embedded and extra parameters over.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        extra : dict, optional
            Extra parameters to copy to the new instance

        Returns
        -------
        :py:class:`OneVsRest`
            Copy of this instance
        """
        if extra is None:
            extra = dict()
        newOvr = Params.copy(self, extra)
        if self.isSet(self.classifier):
            newOvr.setClassifier(self.getClassifier().copy(extra))
        return newOvr
    @classmethod
    def _from_java(cls, java_stage: "JavaObject") -> "OneVsRest":
        """
        Given a Java OneVsRest, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        featuresCol = java_stage.getFeaturesCol()
        labelCol = java_stage.getLabelCol()
        predictionCol = java_stage.getPredictionCol()
        rawPredictionCol = java_stage.getRawPredictionCol()
        classifier: Classifier = JavaParams._from_java(java_stage.getClassifier())
        parallelism = java_stage.getParallelism()
        py_stage = cls(
            featuresCol=featuresCol,
            labelCol=labelCol,
            predictionCol=predictionCol,
            rawPredictionCol=rawPredictionCol,
            classifier=classifier,
            parallelism=parallelism,
        )
        if java_stage.isDefined(java_stage.getParam("weightCol")):
            py_stage.setWeightCol(java_stage.getWeightCol())
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self) -> "JavaObject":
        """
        Transfer this instance to a Java OneVsRest. Used for ML persistence.

        Returns
        -------
        py4j.java_gateway.JavaObject
            Java object equivalent to this instance.
        """
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.classification.OneVsRest", self.uid
        )
        _java_obj.setClassifier(cast(_JavaClassifier, self.getClassifier())._to_java())
        _java_obj.setParallelism(self.getParallelism())
        _java_obj.setFeaturesCol(self.getFeaturesCol())
        _java_obj.setLabelCol(self.getLabelCol())
        _java_obj.setPredictionCol(self.getPredictionCol())
        if self.isDefined(self.weightCol) and self.getWeightCol():
            _java_obj.setWeightCol(self.getWeightCol())
        _java_obj.setRawPredictionCol(self.getRawPredictionCol())
        return _java_obj
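# Illustrative sketch (an assumption, not part of this module): the relabeling that
# ``OneVsRest._fit`` applies for each class index before training one binary model.  Rows whose
# label equals the index become 1.0, all others 0.0; the temporary column name mirrors the
# "mc2b$<index>" scheme used above.
def _example_ovr_relabel(dataset: DataFrame, labelCol: str, index: int) -> DataFrame:
    """Hypothetical helper: one-vs-rest binary view of ``dataset`` for class ``index``."""
    return dataset.withColumn(
        "mc2b$" + str(index),
        when(dataset[labelCol] == float(index), 1.0).otherwise(0.0),
    )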
class _OneVsRestSharedReadWrite:
    @staticmethod
    def saveImpl(
        instance: Union[OneVsRest, "OneVsRestModel"],
        sc: SparkContext,
        path: str,
        extraMetadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        skipParams = ["classifier"]
        jsonParams = DefaultParamsWriter.extractJsonParams(instance, skipParams)
        DefaultParamsWriter.saveMetadata(
            instance, path, sc, paramMap=jsonParams, extraMetadata=extraMetadata
        )
        classifierPath = os.path.join(path, "classifier")
        cast(MLWritable, instance.getClassifier()).save(classifierPath)

    @staticmethod
    def loadClassifier(path: str, sc: SparkContext) -> Union[OneVsRest, "OneVsRestModel"]:
        classifierPath = os.path.join(path, "classifier")
        return DefaultParamsReader.loadParamsInstance(classifierPath, sc)

    @staticmethod
    def validateParams(instance: Union[OneVsRest, "OneVsRestModel"]) -> None:
        elems_to_check: List[Params] = [instance.getClassifier()]
        if isinstance(instance, OneVsRestModel):
            elems_to_check.extend(instance.models)

        for elem in elems_to_check:
            if not isinstance(elem, MLWritable):
                raise ValueError(
                    f"OneVsRest write will fail because it contains {elem.uid} "
                    f"which is not writable."
                )


@inherit_doc
class OneVsRestReader(MLReader[OneVsRest]):
    def __init__(self, cls: Type[OneVsRest]) -> None:
        super(OneVsRestReader, self).__init__()
        self.cls = cls

    def load(self, path: str) -> OneVsRest:
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        if not DefaultParamsReader.isPythonParamsInstance(metadata):
            return JavaMLReader(self.cls).load(path)  # type: ignore[arg-type]
        else:
            classifier = cast(Classifier, _OneVsRestSharedReadWrite.loadClassifier(path, self.sc))
            ova: OneVsRest = OneVsRest(classifier=classifier)._resetUid(metadata["uid"])
            DefaultParamsReader.getAndSetParams(ova, metadata, skipParams=["classifier"])
            return ova


@inherit_doc
class OneVsRestWriter(MLWriter):
    def __init__(self, instance: OneVsRest):
        super(OneVsRestWriter, self).__init__()
        self.instance = instance

    def saveImpl(self, path: str) -> None:
        _OneVsRestSharedReadWrite.validateParams(self.instance)
        _OneVsRestSharedReadWrite.saveImpl(self.instance, self.sc, path)
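# Illustrative sketch (an assumption, not part of this module): how the shared writer above
# lays out a Python-backed OneVsRest on disk -- params/metadata at the root and the base
# classifier saved under a "classifier" subdirectory, which the reader loads back first.
def _example_ovr_save_load(ovr: OneVsRest, path: str) -> OneVsRest:
    """Hypothetical helper: round-trip an estimator through ML persistence."""
    ovr.write().overwrite().save(path)  # metadata + <path>/classifier
    return OneVsRest.load(path)  # re-attaches the saved base classifier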
class OneVsRestModel(
    Model,
    _OneVsRestParams,
    MLReadable["OneVsRestModel"],
    MLWritable,
):
    """
    Model fitted by OneVsRest.
    This stores the models resulting from training k binary classifiers: one for each class.
    Each example is scored against all k models, and the model with the highest score
    is picked to label the example.

    .. versionadded:: 2.0.0
    """
    def setFeaturesCol(self, value: str) -> "OneVsRestModel":
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        return self._set(featuresCol=value)

    def setPredictionCol(self, value: str) -> "OneVsRestModel":
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)

    def setRawPredictionCol(self, value: str) -> "OneVsRestModel":
        """
        Sets the value of :py:attr:`rawPredictionCol`.
        """
        return self._set(rawPredictionCol=value)
    def __init__(self, models: List[ClassificationModel]):
        super(OneVsRestModel, self).__init__()
        self.models = models
        if not isinstance(models[0], JavaMLWritable):
            return
        # set java instance
        java_models = [cast(_JavaClassificationModel, model)._to_java() for model in self.models]
        sc = SparkContext._active_spark_context
        assert sc is not None and sc._gateway is not None
        java_models_array = JavaWrapper._new_java_array(
            java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel
        )
        # TODO: need to set metadata
        metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata")
        self._java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.classification.OneVsRestModel",
            self.uid,
            metadata.empty(),
            java_models_array,
        )

    def _transform(self, dataset: DataFrame) -> DataFrame:
        # determine the input columns: these need to be passed through
        origCols = dataset.columns

        # add an accumulator column to store predictions of all the models
        accColName = "mbc$acc" + str(uuid.uuid4())
        initUDF = udf(lambda _: [], ArrayType(DoubleType()))
        newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]]))

        # persist if underlying dataset is not persistent.
        handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
        if handlePersistence:
            newDataset.persist(StorageLevel.MEMORY_AND_DISK)

        # update the accumulator column with the result of prediction of models
        aggregatedDataset = newDataset
        for index, model in enumerate(self.models):
            rawPredictionCol = self.getRawPredictionCol()
            columns = origCols + [rawPredictionCol, accColName]

            # add temporary column to store intermediate scores and update
            tmpColName = "mbc$tmp" + str(uuid.uuid4())
            updateUDF = udf(
                lambda predictions, prediction: predictions + [prediction.tolist()[1]],
                ArrayType(DoubleType()),
            )
            transformedDataset = model.transform(aggregatedDataset).select(*columns)
            updatedDataset = transformedDataset.withColumn(
                tmpColName,
                updateUDF(transformedDataset[accColName], transformedDataset[rawPredictionCol]),
            )
            newColumns = origCols + [tmpColName]

            # switch out the intermediate column with the accumulator column
            aggregatedDataset = updatedDataset.select(*newColumns).withColumnRenamed(
                tmpColName, accColName
            )

        if handlePersistence:
            newDataset.unpersist()

        if self.getRawPredictionCol():

            def func(predictions: Iterable[float]) -> Vector:
                predArray: List[float] = []
                for x in predictions:
                    predArray.append(x)
                return Vectors.dense(predArray)

            rawPredictionUDF = udf(func, VectorUDT())
            aggregatedDataset = aggregatedDataset.withColumn(
                self.getRawPredictionCol(), rawPredictionUDF(aggregatedDataset[accColName])
            )

        if self.getPredictionCol():
            # output the index of the classifier with highest confidence as prediction
            labelUDF = udf(
                lambda predictions: float(
                    max(enumerate(predictions), key=operator.itemgetter(1))[0]
                ),
                DoubleType(),
            )
            aggregatedDataset = aggregatedDataset.withColumn(
                self.getPredictionCol(), labelUDF(aggregatedDataset[accColName])
            )
        return aggregatedDataset.drop(accColName)
    def copy(self, extra: Optional["ParamMap"] = None) -> "OneVsRestModel":
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This creates a deep copy of the embedded paramMap,
        and copies the embedded and extra parameters over.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        extra : dict, optional
            Extra parameters to copy to the new instance

        Returns
        -------
        :py:class:`OneVsRestModel`
            Copy of this instance
        """
        if extra is None:
            extra = dict()
        newModel = Params.copy(self, extra)
        newModel.models = [model.copy(extra) for model in self.models]
        return newModel
    @classmethod
    def _from_java(cls, java_stage: "JavaObject") -> "OneVsRestModel":
        """
        Given a Java OneVsRestModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        featuresCol = java_stage.getFeaturesCol()
        labelCol = java_stage.getLabelCol()
        predictionCol = java_stage.getPredictionCol()
        classifier: Classifier = JavaParams._from_java(java_stage.getClassifier())
        models: List[ClassificationModel] = [
            JavaParams._from_java(model) for model in java_stage.models()
        ]
        py_stage = cls(models=models).setPredictionCol(predictionCol).setFeaturesCol(featuresCol)
        py_stage._set(labelCol=labelCol)
        if java_stage.isDefined(java_stage.getParam("weightCol")):
            py_stage._set(weightCol=java_stage.getWeightCol())
        py_stage._set(classifier=classifier)
        py_stage._resetUid(java_stage.uid())
        return py_stage

    def _to_java(self) -> "JavaObject":
        """
        Transfer this instance to a Java OneVsRestModel. Used for ML persistence.

        Returns
        -------
        py4j.java_gateway.JavaObject
            Java object equivalent to this instance.
        """
        sc = SparkContext._active_spark_context
        assert sc is not None and sc._gateway is not None
        java_models = [cast(_JavaClassificationModel, model)._to_java() for model in self.models]
        java_models_array = JavaWrapper._new_java_array(
            java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel
        )
        metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata")
        _java_obj = JavaParams._new_java_obj(
            "org.apache.spark.ml.classification.OneVsRestModel",
            self.uid,
            metadata.empty(),
            java_models_array,
        )
        _java_obj.set("classifier", cast(_JavaClassifier, self.getClassifier())._to_java())
        _java_obj.set("featuresCol", self.getFeaturesCol())
        _java_obj.set("labelCol", self.getLabelCol())
        _java_obj.set("predictionCol", self.getPredictionCol())
        if self.isDefined(self.weightCol) and self.getWeightCol():
            _java_obj.set("weightCol", self.getWeightCol())
        return _java_obj
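# Illustrative sketch (an assumption, not part of this module): the argmax rule that
# ``OneVsRestModel._transform`` applies to the accumulated per-class scores when producing
# the prediction column.
def _example_ovr_argmax(scores: List[float]) -> float:
    """Hypothetical helper: index of the best-scoring class, returned as a float label."""
    return float(max(enumerate(scores), key=operator.itemgetter(1))[0])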
[docs]@since("3.0.0")defsetFactorSize(self,value:int)->"FMClassifier":""" Sets the value of :py:attr:`factorSize`. """returnself._set(factorSize=value)
[docs]@since("3.0.0")defsetFitLinear(self,value:bool)->"FMClassifier":""" Sets the value of :py:attr:`fitLinear`. """returnself._set(fitLinear=value)
[docs]@since("3.0.0")defsetMiniBatchFraction(self,value:float)->"FMClassifier":""" Sets the value of :py:attr:`miniBatchFraction`. """returnself._set(miniBatchFraction=value)
[docs]@since("3.0.0")defsetInitStd(self,value:float)->"FMClassifier":""" Sets the value of :py:attr:`initStd`. """returnself._set(initStd=value)
[docs]@since("3.0.0")defsetMaxIter(self,value:int)->"FMClassifier":""" Sets the value of :py:attr:`maxIter`. """returnself._set(maxIter=value)
[docs]@since("3.0.0")defsetStepSize(self,value:float)->"FMClassifier":""" Sets the value of :py:attr:`stepSize`. """returnself._set(stepSize=value)
[docs]@since("3.0.0")defsetTol(self,value:float)->"FMClassifier":""" Sets the value of :py:attr:`tol`. """returnself._set(tol=value)
[docs]@since("3.0.0")defsetSolver(self,value:str)->"FMClassifier":""" Sets the value of :py:attr:`solver`. """returnself._set(solver=value)
[docs]@since("3.0.0")defsetSeed(self,value:int)->"FMClassifier":""" Sets the value of :py:attr:`seed`. """returnself._set(seed=value)
[docs]@since("3.0.0")defsetFitIntercept(self,value:bool)->"FMClassifier":""" Sets the value of :py:attr:`fitIntercept`. """returnself._set(fitIntercept=value)
[docs]@since("3.0.0")defsetRegParam(self,value:float)->"FMClassifier":""" Sets the value of :py:attr:`regParam`. """returnself._set(regParam=value)
class FMClassificationModel(
    _JavaProbabilisticClassificationModel[Vector],
    _FactorizationMachinesParams,
    JavaMLWritable,
    JavaMLReadable["FMClassificationModel"],
    HasTrainingSummary,
):
    """
    Model fitted by :class:`FMClassifier`.

    .. versionadded:: 3.0.0
    """

    @property
    @since("3.0.0")
    def intercept(self) -> float:
        """
        Model intercept.
        """
        return self._call_java("intercept")

    @property
    @since("3.0.0")
    def linear(self) -> Vector:
        """
        Model linear term.
        """
        return self._call_java("linear")

    @property
    @since("3.0.0")
    def factors(self) -> Matrix:
        """
        Model factor term.
        """
        return self._call_java("factors")
[docs]@since("3.1.0")defsummary(self)->"FMClassificationTrainingSummary":""" Gets summary (accuracy/precision/recall, objective history, total iterations) of model trained on the training set. An exception is thrown if `trainingSummary is None`. """ifself.hasSummary:returnFMClassificationTrainingSummary(super(FMClassificationModel,self).summary)else:raiseRuntimeError("No training summary available for this %s"%self.__class__.__name__)
    def evaluate(self, dataset: DataFrame) -> "FMClassificationSummary":
        """
        Evaluates the model on a test dataset.

        .. versionadded:: 3.1.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            Test dataset to evaluate model on.
        """
        if not isinstance(dataset, DataFrame):
            raise TypeError("dataset must be a DataFrame but got %s." % type(dataset))
        java_fm_summary = self._call_java("evaluate", dataset)
        return FMClassificationSummary(java_fm_summary)
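# Illustrative sketch (an assumption, not part of this module): pulling a held-out metric from
# a fitted FMClassificationModel.  The binary summary exposes areaUnderROC in addition to the
# shared accuracy/precision/recall fields; ``test_df`` is a hypothetical evaluation DataFrame.
def _example_fm_evaluate(model: "FMClassificationModel", test_df: DataFrame) -> float:
    """Hypothetical helper: area under ROC of ``model`` on ``test_df``."""
    return model.evaluate(test_df).areaUnderROC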
class FMClassificationSummary(_BinaryClassificationSummary):
    """
    Abstraction for FMClassifier Results for a given model.

    .. versionadded:: 3.1.0
    """

    pass


@inherit_doc
class FMClassificationTrainingSummary(FMClassificationSummary, _TrainingSummary):
    """
    Abstraction for FMClassifier Training results.

    .. versionadded:: 3.1.0
    """

    pass
if __name__ == "__main__":
    import doctest
    import pyspark.ml.classification
    from pyspark.sql import SparkSession

    globs = pyspark.ml.classification.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    spark = SparkSession.builder.master("local[2]").appName("ml.classification tests").getOrCreate()
    sc = spark.sparkContext
    globs["sc"] = sc
    globs["spark"] = spark
    import tempfile

    temp_path = tempfile.mkdtemp()
    globs["temp_path"] = temp_path
    try:
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
        spark.stop()
    finally:
        from shutil import rmtree

        try:
            rmtree(temp_path)
        except OSError:
            pass
    if failure_count:
        sys.exit(-1)