locked
Multi threaded web crawler RSS feed

  • Question

  • Hi all,

    I would like some suggestions regarding a multi threaded web crawler. Up till now I've managed to create several threads, each of which will access a queue to retrieve or store URLs. As you will see in the code below, each crawler will dequeue a URL from the queue, create a web request using that URL and get back the response stream. The response stream will then be passed to an HTML parser which will create a tree representing the HTML document. The crawler will save each new URL in the queue. This will keep going until there are no more URLs in the queue.

    What I would like to do now is display the queue in a ListView object without decreasing the performance of the application. Can anyone shed some light on this please?! If possible, could you please check the code below and give me some feedback on how I can improve it?! Thanks a million!!

    public partial class Form1 : Form
    {
        // Shared URL queue and the worker crawlers that consume it.
        private UrlRepository _urlRepository;
        private Crawler[] _crwCrawlers;
        // Keep the thread references instead of discarding them, so they can
        // be inspected (e.g. joined) later if needed.
        private Thread[] _thrWorkers;

        public Form1()
        {
            InitializeComponent();
            _urlRepository = new UrlRepository();
            _crwCrawlers = new Crawler[5];
            _thrWorkers = new Thread[_crwCrawlers.Length];
        }

        /// <summary>
        /// Seeds the queue with the user-supplied URL and starts one worker
        /// thread per crawler.
        /// </summary>
        private void btnStart_Click(object sender, EventArgs e)
        {
            _urlRepository.Enqueue(txtUrl.Text);

            for (int i = 0; i < _crwCrawlers.Length; i++)
            {
                _crwCrawlers[i] = new Crawler("C" + i, _urlRepository);
                _thrWorkers[i] = new Thread(new ThreadStart(_crwCrawlers[i].Start));
                // Background threads: the original foreground threads kept the
                // process alive after the form closed, because crawlers block
                // indefinitely inside the repository waiting for work.
                _thrWorkers[i].IsBackground = true;
                _thrWorkers[i].Start();
            }

            btnStart.Enabled = false;
            btnStop.Enabled = true;
        }

        /// <summary>
        /// Signals every crawler to stop after its current iteration.
        /// </summary>
        private void btnStop_Click(object sender, EventArgs e)
        {
            btnStop.Enabled = false;
            btnStart.Enabled = true;

            foreach (Crawler c in _crwCrawlers)
            {
                c.Stop();
            }
        }
    }
      
    /// <summary>
    /// Thread-safe blocking queue of URLs shared by all crawler threads.
    /// </summary>
    public class UrlRepository
    {
        // Private lock object: locking on 'this' (as the original did) lets
        // any external code that locks the repository instance deadlock us.
        private readonly object _gate = new object();
        private Queue<string> _queUrls = new Queue<string>();

        /// <summary>Adds a URL and wakes one consumer waiting in Dequeue.</summary>
        public void Enqueue(string url)
        {
            lock (_gate)
            {
                Debug.WriteLine(String.Format("INFO: Enqueue - {0}", url));

                _queUrls.Enqueue(url);
                Monitor.Pulse(_gate);
            }
        }

        /// <summary>
        /// Removes and returns the next URL, blocking until one is available.
        /// </summary>
        public string Dequeue()
        {
            string sUrl;

            lock (_gate)
            {
                // 'while', not 'if': Monitor.Wait can wake spuriously, and
                // another consumer may drain the pulsed item before this
                // thread reacquires the lock.
                while (_queUrls.Count == 0)
                {
                    Monitor.Wait(_gate);
                }

                // Original bug: after waking from Wait the item was never
                // dequeued and "" was returned, so every pulse cost each
                // consumer a wasted loop iteration. Now we always dequeue.
                sUrl = _queUrls.Dequeue();
            }

            Debug.WriteLine(String.Format("INFO: Dequeue - {0}", sUrl));

            return sUrl;
        }
    }
      
    /// <summary>
    /// Worker that repeatedly takes a URL from the shared repository, fetches
    /// the page, enqueues every discovered absolute link and logs it to
    /// "&lt;name&gt;.txt". Runs until Stop() is called.
    /// </summary>
    public class Crawler
    {
        // volatile: written from the UI thread (Stop) and read in the worker
        // loop; without it the JIT may cache the flag and the loop never ends.
        private volatile bool _bStop;
        private string _sName;
        private UrlRepository _urlRepository;

        public Crawler(string name, UrlRepository urlRepository)
        {
            _bStop = false;
            _sName = name;
            _urlRepository = urlRepository;
        }

        /// <summary>Entry point for the worker thread.</summary>
        public void Start()
        {
            Debug.WriteLine(String.Format("INFO {0} : Starting crawler", _sName));

            // using: guarantees the log file is flushed and closed even if
            // the loop exits via an exception thrown outside the inner try
            // (the original leaked the stream in that case).
            using (StreamWriter swWriter = new StreamWriter(
                new FileStream(_sName + ".txt", FileMode.Append, FileAccess.Write)))
            {
                while (!_bStop)
                {
                    string sUrl = _urlRepository.Dequeue();
                    if (sUrl == "")
                    {
                        continue;
                    }

                    Debug.WriteLine(String.Format("INFO {0}: Crawling - {1}", _sName, sUrl));

                    try
                    {
                        WebRequest webRequest = WebRequest.Create(sUrl);

                        // Dispose the response (and its stream): the original
                        // never closed it, which leaks pooled connections and
                        // stalls the crawler after a handful of requests.
                        using (WebResponse webResponse = webRequest.GetResponse())
                        {
                            XComponents.Html.HtmlDocument htmlDoc =
                                new XComponents.Html.HtmlDocument(webResponse.GetResponseStream());

                            foreach (HtmlNode n in htmlDoc.Nodes.FindNodes("a"))
                            {
                                foreach (HtmlAttribute a in n.Attributes)
                                {
                                    if (a.Value.Contains("http://"))
                                    {
                                        _urlRepository.Enqueue(a.Value);
                                        swWriter.WriteLine(a.Value);
                                    }
                                }
                            }
                        }

                        swWriter.Flush();
                    }
                    catch (Exception ex)
                    {
                        // Best-effort crawl: log the failure and move on.
                        Debug.WriteLine(String.Format("ERROR {0} : {1}", _sName, ex.Message));
                    }
                }
            }

            Debug.WriteLine(String.Format("INFO {0} : Crawler stopped", _sName));
        }

        /// <summary>
        /// Requests the worker loop to stop. NOTE(review): a thread blocked
        /// inside UrlRepository.Dequeue will not observe the flag until the
        /// next URL arrives — confirm this is acceptable for shutdown.
        /// </summary>
        public void Stop()
        {
            _bStop = true;
            Debug.WriteLine(String.Format("INFO {0} : Stopping crawler", _sName));
        }
    }

    Thursday, June 19, 2008 7:08 PM

Answers

  •  For questions and discussions regarding client application development using Windows Forms controls and features, please see http://forums.microsoft.com/msdn/ShowForum.aspx?ForumID=8&SiteID=1

    You'll have to marshal or dispatch calls from your background thread to your UI (i.e. you can't call myListView.Items.Add from a background thread).  This can be done directly from your background thread by checking the Control.InvokeRequired property; and if true, call Control.Invoke with a delegate to a method that calls ListView.Items.Add.

    Likely, though, since you're starting numerous background threads based on queued information that might become a performance bottleneck.  In which case, I'd suggest using a queue of text to add to your list view, then have your form check this queue periodically (once every half second for example through a Windows.Forms.Timer component) and take the data in the queue and add it to the list.  Be sure to synchronize access to this queue so that it can't be updated while you're removing items from it.
    http://www.peterRitchie.com/blog
    Friday, June 20, 2008 3:41 PM
  • This thread was moved into the Off Topic forum. I can move it into a specific forum for you if you want.

    Thanks!


    Ed Price (a.k.a User Ed), SQL Server Experience Program Manager (Blog, Twitter, Wiki)

    Friday, May 11, 2012 4:36 AM

All replies

  •  For questions and discussions regarding client application development using Windows Forms controls and features, please see http://forums.microsoft.com/msdn/ShowForum.aspx?ForumID=8&SiteID=1

    You'll have to marshal or dispatch calls from your background thread to your UI (i.e. you can't call myListView.Items.Add from a background thread).  This can be done directly from your background thread by checking the Control.InvokeRequired property; and if true, call Control.Invoke with a delegate to a method that calls ListView.Items.Add.

    Likely, though, since you're starting numerous background threads based on queued information that might become a performance bottleneck.  In which case, I'd suggest using a queue of text to add to your list view, then have your form check this queue periodically (once every half second for example through a Windows.Forms.Timer component) and take the data in the queue and add it to the list.  Be sure to synchronize access to this queue so that it can't be updated while you're removing items from it.
    http://www.peterRitchie.com/blog
    Friday, June 20, 2008 3:41 PM
  • This thread was moved into the Off Topic forum. I can move it into a specific forum for you if you want.

    Thanks!


    Ed Price (a.k.a User Ed), SQL Server Experience Program Manager (Blog, Twitter, Wiki)

    Friday, May 11, 2012 4:36 AM