Extending LDA

  • Question

  • Hi,

    I am trying to extend the original LDA model by inserting a plate of sentences, so that every sentence is also assigned a topic. In the model, I want the sentence topic to be the document topic plus Gaussian noise.

    I am having some trouble adding this noise. I am new to Infer.NET and would appreciate any help. I have highlighted the changes I made to the original LDA code below.

    CODE:

                Range W = new Range(SizeVocab).Named("W");
                Range T = new Range(NumTopics).Named("T");
                Range D = new Range(NumDocs).Named("D");
                NumSentInDoc = Variable.Array<int>(D).Named("NumSentInDoc");
                Range SInD = new Range(NumSentInDoc[D]).Named("SInD");
                NumWordsInSent = Variable.Array(Variable.Array<int>(SInD), D);
                Range WInS = new Range(NumWordsInSent[D][SInD]).Named("WInS");

                // Surround model by a stochastic If block so that we can compute model evidence
                Evidence = Variable.Bernoulli(0.5).Named("Evidence");
                IfBlock evidenceBlock = Variable.If(Evidence);

                Theta = Variable.Array<Vector>(D);
                Theta.SetSparsity(ThetaSparsity);
                Theta.SetValueRange(T);
                ThetaPrior = Variable.Array<Dirichlet>(D).Named("ThetaPrior");
                Theta[D] = Variable<Vector>.Random(ThetaPrior[D]);

                Kappa = Variable.Array(Variable.Array<Vector>(SInD), D);
                Kappa.SetSparsity(ThetaSparsity);
                Kappa.SetValueRange(T);

                Phi = Variable.Array<Vector>(T);
                Phi.SetSparsity(PhiSparsity);
                Phi.SetValueRange(W);
                PhiPrior = Variable.Array<Dirichlet>(T).Named("PhiPrior");
                Phi[T] = Variable<Vector>.Random(PhiPrior[T]);

                Words = Variable.Array(Variable.Array(Variable.Array<int>(WInS), SInD), D).Named("Words");
                WordCounts = Variable.Array(Variable.Array(Variable.Array<double>(WInS), SInD), D).Named("WordCounts");
               
                using (Variable.ForEach(D))
                {
                    using (Variable.ForEach(SInD))
                    {
                    Kappa[D][SInD] = Theta[D]; // + GaussianNoise;

                        using (Variable.ForEach(WInS))
                        {
                            using (Variable.Repeat(WordCounts[D][SInD][WInS]))
                            {
                                Variable<int> topic = Variable.Discrete(Kappa[D][SInD]).Named("topic");
                                using (Variable.Switch(topic))
                                    Words[D][SInD][WInS] = Variable.Discrete(Phi[topic]);
                            }
                        }
                    }
                }

                evidenceBlock.CloseBlock();

                ThetaInit = Variable.New<IDistribution<Vector[]>>().Named("ThetaInit");
                Theta.InitialiseTo(ThetaInit);
                Engine = new InferenceEngine(new VariationalMessagePassing());
                Engine.Compiler.ShowWarnings = false;


    • Edited by pc219 Tuesday, November 5, 2013 10:38 PM
    Tuesday, November 5, 2013 10:37 PM

All replies

  • Your best bet for adding noise to a probability vector is to do it in Gaussian space, and then use the Softmax factor when you need the probability vector.

    So in your example, the simplest route to get started would be:

    1. Define a VariableArray<Vector> variable LogTheta.
    2. Define a VectorGaussian prior distribution LogThetaPrior with a zero Vector mean and a diagonal PositiveDefiniteMatrix precision (as an initial test, just use the same prior for all D). To construct these you can use the factory methods on the Vector and PositiveDefiniteMatrix classes (for example, PositiveDefiniteMatrix.Identity(dimension)).
    3. LogTheta[D] = Variable<Vector>.Random(LogThetaPrior).ForEach(D);
    4. Define a noise precision in the form of a diagonal PositiveDefiniteMatrix.
    5. Then define LogKappa[D][SInD] = Variable.VectorGaussianFromMeanAndPrecision(LogTheta[D], noise).ForEach(SInD);
    6. Finally, Kappa[D][SInD] = Variable.Softmax(LogKappa[D][SInD]), and Theta[D] = Variable.Softmax(LogTheta[D]). A condensed sketch of these steps is shown just below.
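
    Something like the following (an untested sketch; it assumes Kappa and Theta are already declared with SetValueRange(T) as in your model, and that NumTopics is the topic count):

        // Steps 1-3: per-document log topic proportions with a shared VectorGaussian prior.
        var LogThetaPrior = VectorGaussian.FromMeanAndPrecision(
            Vector.Zero(NumTopics), PositiveDefiniteMatrix.Identity(NumTopics));
        var LogTheta = Variable.Array<Vector>(D).Named("LogTheta");
        LogTheta[D] = Variable<Vector>.Random(LogThetaPrior).ForEach(D);

        // Step 4: a diagonal noise precision.
        var NoisePrecision = PositiveDefiniteMatrix.Identity(NumTopics);

        // Step 5: per-sentence log proportions = document log proportions + Gaussian noise.
        var LogKappa = Variable.Array(Variable.Array<Vector>(SInD), D).Named("LogKappa");
        LogKappa[D][SInD] = Variable.VectorGaussianFromMeanAndPrecision(LogTheta[D], NoisePrecision).ForEach(SInD);

        // Step 6: map back to probability vectors with Softmax.
        Kappa[D][SInD] = Variable.Softmax(LogKappa[D][SInD]);
        Theta[D] = Variable.Softmax(LogTheta[D]);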

    Finally, there are various options for Softmax operators, most of which are quite experimental. I suggest you start off by trying:

    Engine.Compiler.GivePriorityTo(typeof(SoftmaxOp_BL06));

    Try this out on some toy data, see how far you get, and report back. To scale to large numbers of topics, you will probably need to define logTheta as a VariableArray<VariableArray<double>, double[][]> defined over D and T, so that you can avoid VectorGaussian calculations and messages, which are slow and memory-hungry. A rough sketch of that representation follows.
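
    For illustration only (untested; in particular it assumes your version of Infer.NET has a Variable.Softmax overload that accepts a VariableArray<double>):

        // Hypothetical element-wise parameterisation: one Gaussian per (document, topic)
        // entry instead of one VectorGaussian per document, avoiding the quadratic cost
        // of VectorGaussian messages in the number of topics.
        var LogTheta = Variable.Array(Variable.Array<double>(T), D).Named("LogTheta");
        LogTheta[D][T] = Variable.GaussianFromMeanAndPrecision(0.0, 1.0).ForEach(D, T);
        var Theta = Variable.Array<Vector>(D).Named("Theta");
        Theta.SetValueRange(T);
        Theta[D] = Variable.Softmax(LogTheta[D]);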

    John

    Thursday, November 7, 2013 3:01 PM
    Owner
  • Thanks John. I followed what you suggested, and in the process had to change all the distributions (topic proportions, word distributions over topics) to be Gaussian (I realise that I am moving away from Dirichlets entirely, which is another issue to deal with later).

    But now I am getting the following error during inference:

    [0] System.ArgumentException: int is not of type Discrete for argument 1 of method DiscreteFromDirichletOp.AverageLogFactor(Discrete sample = int, Dirichlet probs = VectorGaussian)
    [1] System.ArgumentException: int is not of type Discrete for argument 1 of method DiscreteFromDirichletOp.AverageLogFactor(Discrete sample = int, Vector probs = VectorGaussian)
    [2] System.ArgumentException: VectorGaussian is not of type Dirichlet for argument 2 of method DiscreteFromDirichletOp.AverageLogFactor(int sample = int, Dirichlet probs = VectorGaussian)
    [3] System.ArgumentException: VectorGaussian is not of type Vector for argument 2 of method DiscreteFromDirichletOp.AverageLogFactor(int sample = int, Vector probs = VectorGaussian)

    Here is what the model looks like now:

    Noise = PositiveDefiniteMatrix.Identity(numTopics);
    LogThetaPrior = Variable.New<VectorGaussian>().Named("LogThetaPrior");
    LogThetaPrior = VectorGaussian.FromMeanAndPrecision(Vector.Zero(numTopics), PositiveDefiniteMatrix.Identity(numTopics));
    LogTheta = Variable.Array<Vector>(D);
    LogTheta[D] = Variable<Vector>.Random(LogThetaPrior).ForEach(D);
    LogKappa = Variable.Array(Variable.Array<Vector>(SInD), D);
    LogKappa.SetSparsity(ThetaSparsity);
    LogKappa[D][SInD] = Variable.VectorGaussianFromMeanAndPrecision(LogTheta[D], Noise).ForEach(SInD);

    Theta = Variable.Array<Vector>(D);
    Theta.SetSparsity(ThetaSparsity);
    Theta.SetValueRange(T);
    Theta[D] = Variable.Softmax(LogTheta[D]);
    Kappa = Variable.Array(Variable.Array<Vector>(SInD), D);
    Kappa.SetSparsity(ThetaSparsity);
    Kappa.SetValueRange(T);
    Kappa[D][SInD] = Variable.Softmax(LogKappa[D][SInD]);
    Phi = Variable.Array<Vector>(T);
    Phi.SetSparsity(PhiSparsity);
    Phi.SetValueRange(W);
    PhiGaussianPrior = Variable.Array<VectorGaussian>(T).Named("PhiPrior");
    Phi[T] = Variable<Vector>.Random(PhiGaussianPrior[T]);
    Words = Variable.Array(Variable.Array(Variable.Array<int>(WInS), SInD), D).Named("Words");
    WordCounts = Variable.Array(Variable.Array(Variable.Array<double>(WInS), SInD), D).Named("WordCounts");

    using (Variable.ForEach(D))
    {
        using (Variable.ForEach(SInD))
        {
            using (Variable.ForEach(WInS))
            {
                using (Variable.Repeat(WordCounts[D][SInD][WInS]))
                {
                    Variable<int> topic = Variable.Discrete(Kappa[D][SInD]).Named("topic");
                    using (Variable.Switch(topic))
                        Words[D][SInD][WInS] = Variable.Discrete(Phi[topic]);
                }
            }
        }
    }

    evidenceBlock.CloseBlock();

    ThetaInit = Variable.New<IDistribution<Vector[]>>().Named("ThetaInit");
    Theta.InitialiseTo(ThetaInit);
    Engine = new InferenceEngine(new VariationalMessagePassing());
    Engine.Compiler.ShowWarnings = false;
    Engine.Compiler.GivePriorityTo(typeof(SoftmaxOp_BL06));
    Engine.ModelName = "LDAQueryExtensionModel";

    And the inference:

    PhiGaussianPrior.ObservedValue = new VectorGaussian[NumTopics];
    for (int i = 0; i < NumTopics; i++)
        PhiGaussianPrior.ObservedValue[i] = VectorGaussian.Uniform(SizeVocab);
    Engine.OptimiseForVariables = new IVariable[] { Theta, Kappa, Phi, Evidence };

    postTheta = Engine.Infer<VectorGaussian[]>(Theta);
    postPhi = Engine.Infer<VectorGaussian[]>(Phi);
    postKappa = Engine.Infer<VectorGaussian[][]>(Kappa);
    return Engine.Infer<Bernoulli>(Evidence).LogOdds;

    Thanks for your help!

    Monday, November 11, 2013 8:57 PM
  • I suspect this is due to the fact that you are making Phi a VectorGaussian-distributed random variable and then applying the Discrete factor to it. Anyway, here is a self-contained implementation with dummy data which you should be able to plug directly into a console project, just to check that the Infer.NET compiler will build this type of model; I can verify that it does compile. There are a couple of differences from your implementation:

    (a) I have made Phi Dirichlet-distributed.

    (b) I initialize logTheta rather than Theta in order to break symmetry.

    There is still a lot of work that you would need to do for scalability, and to check that inference converges for real data, but this should give you a good start.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using MicrosoftResearch.Infer;
    using MicrosoftResearch.Infer.Distributions;
    using MicrosoftResearch.Infer.Factors;
    using MicrosoftResearch.Infer.Maths;
    using MicrosoftResearch.Infer.Models;
    
    namespace ExtendLDAToSentences
    {
        class Program
        {
            static void Main(string[] args)
            {
                var NumTopics = 2;
                var NumDocuments = Variable.New<int>().Named("NumDocuments");
                var SizeVocab = Variable.New<int>().Named("SizeVocab");
                var ThetaSparsity = Sparsity.Dense;
                var PhiSparsity = Sparsity.ApproximateWithTolerance(0.00000000001); // Allow for round-off error
                Range D = new Range(NumDocuments).Named("D");
                Range W = new Range(SizeVocab).Named("W");
                Range T = new Range(NumTopics).Named("T");
                var NumSentInDoc = Variable.Array<int>(D).Named("NumSentInDoc");
                Range SInD = new Range(NumSentInDoc[D]).Named("SInD");
                var NumWordsInSent = Variable.Array(Variable.Array<int>(SInD), D);
                Range WInS = new Range(NumWordsInSent[D][SInD]).Named("WInS");
    
                var Noise = PositiveDefiniteMatrix.Identity(NumTopics);
                var LogThetaPrior = Variable.New<VectorGaussian>().Named("LogThetaPrior");
                LogThetaPrior.ObservedValue = VectorGaussian.FromMeanAndPrecision(Vector.Zero(NumTopics), PositiveDefiniteMatrix.Identity(NumTopics));
                
                var LogTheta = Variable.Array<Vector>(D);
                LogTheta[D] = Variable<Vector>.Random(LogThetaPrior).ForEach(D);
                var LogKappa = Variable.Array(Variable.Array<Vector>(SInD), D);
                LogKappa.SetSparsity(ThetaSparsity);
                LogKappa[D][SInD] = Variable.VectorGaussianFromMeanAndPrecision(LogTheta[D], Noise).ForEach(SInD);
                var Theta = Variable.Array<Vector>(D);
                Theta.SetSparsity(ThetaSparsity);
                Theta.SetValueRange(T);
                Theta[D] = Variable.Softmax(LogTheta[D]);
    
                var Kappa = Variable.Array(Variable.Array<Vector>(SInD), D);
                Kappa.SetSparsity(ThetaSparsity);
                Kappa.SetValueRange(T);            
                Kappa[D][SInD] = Variable.Softmax(LogKappa[D][SInD]);
    
                var Phi = Variable.Array<Vector>(T);
                Phi.SetSparsity(PhiSparsity);
                Phi.SetValueRange(W);
                var PhiPrior = Variable.Array<Dirichlet>(T).Named("PhiPrior");
                Phi[T] = Variable<Vector>.Random(PhiPrior[T]);
    
                var Words = Variable.Array(Variable.Array(Variable.Array<int>(WInS), SInD), D).Named("Words");
                var WordCounts = Variable.Array(Variable.Array(Variable.Array<double>(WInS), SInD), D).Named("WordCounts");
    
                using (Variable.ForEach(D))
                {
                    using (Variable.ForEach(SInD))
                    {
                        using (Variable.ForEach(WInS))
                        {
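                            // Repeat raises the likelihood inside this block to the power
                            // WordCounts[D][SInD][WInS], i.e. each distinct word is observed
                            // "count" times without replicating it in the data arrays.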
                            using (Variable.Repeat(WordCounts[D][SInD][WInS]))
                            {
                                Variable<int> topic = Variable.Discrete(Kappa[D][SInD]).Named("topic");
                                using (Variable.Switch(topic))
                                    Words[D][SInD][WInS] = Variable.Discrete(Phi[topic]);
                            }
                        }
                    }
                }
    
                var LogThetaInit = Variable.New<IDistribution<Vector[]>>().Named("LogThetaInit");
                LogTheta.InitialiseTo(LogThetaInit);
                var Engine = new InferenceEngine(new VariationalMessagePassing());
                Engine.Compiler.ShowWarnings = false;
                Engine.Compiler.GivePriorityTo(typeof(SoftmaxOp_BL06));
                Engine.ModelName = "LDAQueryExtensionModel";
    
                // Set up some dummy observed values, just to check this compiles.
                var numDocs = 1;
                NumDocuments.ObservedValue = numDocs;
                int[] numSentInDoc = new int[numDocs];
                int[][] numWordsInSent = new int[numDocs][];
                int[][][] wordIndices = new int[numDocs][][];
                double[][][] wordCounts = new double[numDocs][][];
                for (int i = 0; i < numDocs; i++)
                {
                    int nsd = 1;
                    numSentInDoc[i] = nsd;
                    numWordsInSent[i] = new int[nsd];
                    wordIndices[i] = new int[nsd][];
                    wordCounts[i] = new double[nsd][];
    
                    for (int j = 0; j < nsd; j++)
                    {
                        int nws = 1;
                        numWordsInSent[i][j] = nws;
                        wordIndices[i][j] = new int[nws];
                        wordCounts[i][j] = new double[nws];
    
                        for (int k = 0; k < nws; k++)
                        {
                            wordIndices[i][j][k] = 0;
                            wordCounts[i][j][k] = 1.0;
                        }
                    }
                }
    
                SizeVocab.ObservedValue = 1;
                NumSentInDoc.ObservedValue = numSentInDoc;
                NumWordsInSent.ObservedValue = numWordsInSent;
                Words.ObservedValue = wordIndices;
                WordCounts.ObservedValue = wordCounts;
                LogThetaInit.ObservedValue = GetInitialisation(numDocs, NumTopics, ThetaSparsity);
                PhiPrior.ObservedValue = new Dirichlet[NumTopics];
                for (int i = 0; i < NumTopics; i++)
                    PhiPrior.ObservedValue[i] = Dirichlet.Symmetric(SizeVocab.ObservedValue, 0.1);
                Engine.OptimiseForVariables = new IVariable[] { Theta, Phi };
                var postTheta = Engine.Infer<Dirichlet[]>(Theta);
                var postPhi = Engine.Infer<Dirichlet[]>(Phi);
    
            }
    
            public static IDistribution<Vector[]> GetInitialisation(
                 int numDocs, int numTopics, Sparsity sparsity)
            {
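                // Break symmetry by initialising each document's LogTheta to a random
                // point mass drawn from the standard VectorGaussian.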
                VectorGaussian standard = VectorGaussian.FromMeanAndPrecision(Vector.Zero(numTopics), PositiveDefiniteMatrix.Identity(numTopics));
                VectorGaussian[] initTheta = new VectorGaussian[numDocs];
    
                for (int i = 0; i < numDocs; i++)
                {
                    initTheta[i] = VectorGaussian.PointMass(standard.Sample());
                }
                return Distribution<Vector>.Array(initTheta);
            }
        }
    }
    

    • Marked as answer by pc219 Wednesday, November 13, 2013 10:19 PM
    Wednesday, November 13, 2013 10:15 AM
    Owner
  • Thanks John... that works! I needed to infer postTheta and postPhi as arrays of Dirichlets (Dirichlet[]).

    Thanks a lot!

    Wednesday, November 13, 2013 10:21 PM