Recursive Scraping using different styles with MySQL Database.

Hi,
I don't have much time to write a full Scrapy tutorial, but this example may be a helping hand for Scrapy beginners.

This program recursively scrapes youtube.com: the links to scrape come from a database, and the results are stored back in the database.

Here is items.py:
---------------------------------------------------------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Project2Item(Item):
    """Container for one video entry scraped from a search-results page.

    Every attribute below is declared as a plain ``Field()`` with no
    metadata; new fields are added the same way (``name = Field()``).
    """

    # Link to the video.
    videoLink = Field()
    # Running time shown on the thumbnail.
    videoDuration = Field()
    # Thumbnail image source.
    imageVideo = Field()
    # Video title.
    titleVideo = Field()
    # Descriptive text of the result entry.
    Description = Field()
    # View-count text.
    view = Field()
---------------------------------------------------------------------------------------------------------------------

here is your pipelines.py
-------------------------------------------------------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import re
class Project2Pipeline(object):
    """Item pipeline that stores every scraped video in the ``scrapyTest`` table.

    One MySQL connection is opened when the pipeline is created, reused for
    every item, and closed again in :meth:`close_spider`.
    """

    def __init__(self):
        """Connect to the local ``prime`` database and create a cursor."""
        self.conn = MySQLdb.connect(
            host='localhost',
            user='root',
            passwd='froot',
            db='prime',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or None when it is empty."""
        return values[0] if values else None

    def process_item(self, item, spider):
        """Insert *item* into ``scrapyTest`` and hand it to the next pipeline.

        ``titleVideo`` and ``view`` are lists of extracted strings; only the
        first entry of each is stored, and ``None`` is stored when a list is
        empty instead of failing with ``IndexError``.

        A ``MySQLdb.Error`` raised by the insert or the commit is printed and
        otherwise ignored.

        Returns:
            The unchanged *item*, whether or not the insert succeeded.
        """
        params = (
            item['videoLink'],
            self._first(item['titleVideo']),
            self._first(item['view']),
        )
        try:
            self.cursor.execute(
                """insert into scrapyTest (videoLink, videoTitle,videoView,flag)
                VALUES (%s, %s, %s,1)""",
                params,
            )
            self.conn.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        # Always return the item: returning it only from the error path would
        # hand None to the rest of the pipeline on every successful insert.
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection once the spider is done."""
        self.cursor.close()
        self.conn.close()
--------------------------------------------------------------------------------------------------------------------------------
here is your settings.py
======================================================================
# Scrapy settings for project2 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

# Name of this Scrapy project/bot.
BOT_NAME = "project2"

# Spider package settings: both point at the project's ``spiders`` package.
SPIDER_MODULES = ["project2.spiders"]
NEWSPIDER_MODULE = "project2.spiders"

# Per-domain concurrency limit for requests.
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Enabled item pipelines, each mapped to its integer order value.
ITEM_PIPELINES = {"project2.pipelines.Project2Pipeline": 300}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'project2 (+http://www.yourdomain.com)'
-------------------------------------------------------------------------------------------------------------------------------
Here are your different spider files:
========================================================================
from scrapy.spider import Spider
from scrapy.selector import Selector
from project2.items import Project2Item
import urlparse
from scrapy.http.request import Request
import MySQLdb
class spider(Spider):
    """Spider "Youtube": scrapes search-result pages whose URLs come from MySQL.

    The start URLs are the ``pageLink`` values of all ``scrapyTest`` rows with
    ``flag=2``. They are read once, while this class body executes, and the
    database connection is closed again right after the query.
    """

    name = "Youtube"
    allowed_domains = ["youtube.com"]

    # Fetch the start URLs; try/finally guarantees the connection is released
    # even when the query fails.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='froot', db='prime')
    try:
        cursor = conn.cursor()
        cursor.execute("select pageLink from scrapyTest where flag=2")
        row = [record[0] for record in cursor.fetchall()]
        cursor.close()
    finally:
        conn.close()
    print(row)
    start_urls = row  # ["https://www.youtube.com/results?search_query=live+music+INdia&page=1"]

    def parse(self, response):
        """Yield one ``Project2Item`` per result entry, then the next page.

        Result entries without a link under ``div[1]/a`` are skipped instead
        of aborting the whole page with ``IndexError``. When the page has a
        "next" navigation link, a request for it is yielded last, with this
        method as its callback.
        """
        sel = Selector(response)
        next_page = sel.xpath(
            "//div[@role='navigation']/a[@data-link-type='next']/@href").extract()

        for listi in sel.xpath("//ol[@id='search-results']/li"):
            href = listi.xpath("div[1]/a/@href").extract()
            if not href:
                continue
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + href[0]
            item['videoDuration'] = listi.xpath(
                "div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): 'Description' and 'view' use the same XPath, so
            # both receive the same text -- confirm which element the
            # description should really come from.
            item['Description'] = listi.xpath(
                "div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath(
                "div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            yield item

        if next_page:
            yield Request("http://www.youtube.com" + next_page[0], callback=self.parse)

---------------------------------------------------------------------------------------------------------------------------------
second method:
========================================================================
from scrapy.selector import Selector
from scrapy.spider import Spider
from project2.items import Project2Item
from scrapy.http.request import Request

class youtubespider(Spider):
    """Spider "youtube2": scrapes a fixed range of search-result pages.

    Instead of following the "next" link, the start URLs for result pages
    1 to 50 of one search query are generated up front.
    """

    name = "youtube2"
    # Must be a list: a bare string would be iterated character by character.
    allowed_domains = ["www.youtube.com"]
    # Result pages are requested starting from page=1 (not page=0).
    start_urls = [
        'https://www.youtube.com/results?search_query=live+music+concert+india&page=%s' % page
        for page in range(1, 51)
    ]

    def parse(self, response):
        """Return a list with one ``Project2Item`` per result entry.

        Result entries without a link under ``div[1]/a`` are skipped instead
        of aborting the whole page with ``IndexError``.
        """
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            href = listi.xpath("div[1]/a/@href").extract()
            if not href:
                continue
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + href[0]
            item['videoDuration'] = listi.xpath(
                "div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): 'Description' and 'view' use the same XPath, so
            # both receive the same text -- confirm which element the
            # description should really come from.
            item['Description'] = listi.xpath(
                "div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath(
                "div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items

=======================================================================
third method:
----------------------------------------------------------------------------------------------------------------------------------
from scrapy.selector import Selector
from project2.items import Project2Item
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class youtubespider(CrawlSpider):
    """Spider "bollywoodMovies": follows pagination links via a crawl rule.

    Every extracted link whose URL matches ``page=\\d+`` is followed and its
    response is passed to :meth:`parseitem`.
    """

    name = "bollywoodMovies"
    allowed_domains = ["www.youtube.com"]
    start_urls = ['https://www.youtube.com/results?search_query=bollywood+movies&page=1']
    # Raw string so that ``\d`` reaches the regex engine unchanged.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r"page=\d+",)), callback="parseitem", follow=True),
    )

    def parseitem(self, response):
        """Return a list with one ``Project2Item`` per result entry.

        Result entries without a link under ``div[1]/a`` are skipped instead
        of aborting the whole page with ``IndexError``.
        """
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            href = listi.xpath("div[1]/a/@href").extract()
            if not href:
                continue
            item = Project2Item()
            item['videoDuration'] = listi.xpath(
                "div[1]/a/span[@class='video-time']/text()").extract()
            item['videoLink'] = "www.youtube.com" + href[0]
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath(
                "div[2]/div[2]/span/@data-original-html").extract()
            item['view'] = listi.xpath(
                "div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items


========================================================================
Fourth style: try it yourself.

I hope this is helpful for you.
----------------------------------------------------------------------------------------------------------------------------------

1 comment:

Realtime Experts said...

Hi, Great.. Tutorial is just awesome..It is really helpful for a newbie like me.. I am a regular follower of your blog. Really very informative post you shared here. Kindly keep blogging. If anyone wants to become a .Net developer learn from Dot Net Training in Chennai. or learn thru ASP.NET Essential Training Online . Nowadays Dot Net has tons of job opportunities on various vertical industry.DataScience with Python Training in Bangalore