This Blog is just for Educational Purpose. This Blog provide information about python and python frameworks : Django, Selenium, Scrapy etc.
Pages
- Home
- 0. Getting Started with Selenium
- 1. Moving To a Selenium Example
- 2. Clear Concept Using Python Selenium
- 3. Creating Test Cases Using Python and Unittest IN Selenium
- 4. Python Webdriver API's
- 5.Locating Elements On a Web Page
- 6. Selenium Examples For You
- 7. How to Choose Programming Language - Selenium
Python Insider: Python 3.3.3 released
Python Insider: Python 3.3.3 released: A new maintenance release, Python 3.3.3, has been released on November 13, 2013.
Recursive Scraping using different styles with MySQL Database.
Hi,
I don't have much time to write a full Scrapy tutorial for you, but this example may be a helpful hand to Scrapy beginners.
This program recursively scrapes youtube.com; the links to scrape come from a database, and the results are stored back in the database.
here is
Items.py
---------------------------------------------------------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class Project2Item(Item):
    """Item holding the fields scraped for one YouTube search result."""
    # define the fields for your item here like:
    # name = Field()
    videoLink = Field()      # link to the video page (prefixed with the domain)
    videoDuration = Field()  # duration text, e.g. "3:45"
    imageVideo = Field()     # thumbnail image URL
    titleVideo = Field()     # video title
    Description = Field()    # description / metadata snippet
    view = Field()           # view-count text
---------------------------------------------------------------------------------------------------------------------
here is your pipelines.py
-------------------------------------------------------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import re
class Project2Pipeline(object):
    """Store each scraped item into the MySQL table ``scrapyTest``."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; in a real project move
        # them into settings.py and read them from the crawler settings.
        self.conn = MySQLdb.connect(host='localhost',
                                    user='root',
                                    passwd='froot',
                                    db='prime',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item; on a DB error report it, then pass the item on."""
        try:
            # Parameterized query: MySQLdb escapes the values for us.
            # videoLink is already a plain string; title/view are lists, so
            # the first element is taken.
            self.cursor.execute("""insert into scrapyTest (videoLink, videoTitle,videoView,flag)
VALUES (%s, %s, %s,1)""", (item['videoLink'], item['titleVideo'][0], item['view'][0]))
            self.conn.commit()
        except MySQLdb.Error as e:
            # `as e` / print(...) replace the Python-2-only forms so the
            # snippet also parses under Python 3.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
--------------------------------------------------------------------------------------------------------------------------------
here is your settings.py
======================================================================
# Scrapy settings for project2 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Scrapy project configuration for project2.
BOT_NAME = 'project2'

# Where Scrapy discovers spiders and where `genspider` puts new ones.
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'

# Be polite: at most one concurrent request per domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Route every scraped item through the MySQL pipeline (priority 300).
ITEM_PIPELINES = {'project2.pipelines.Project2Pipeline': 300}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'project2 (+http://www.yourdomain.com)'
-------------------------------------------------------------------------------------------------------------------------------
here is your different spiders.py files:
========================================================================
from scrapy.spider import Spider
from scrapy.selector import Selector
from project2.items import Project2Item
import urlparse
from scrapy.http.request import Request
import MySQLdb
class spider(Spider):
    """Scrape YouTube search-result pages whose URLs are read from MySQL."""
    name = "Youtube"
    allowed_domains = ["youtube.com"]

    # NOTE(review): this query runs at import time (class body), not when the
    # crawl starts -- the page links flagged 2 in scrapyTest become start URLs.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='froot', db='prime')
    cursor = conn.cursor()
    cursor.execute("select pageLink from scrapyTest where flag=2")
    row = [item[0] for item in cursor.fetchall()]
    print(row)
    start_urls = row  # ["https://www.youtube.com/results?search_query=live+music+INdia&page=1"]

    def parse(self, response):
        """Yield one Project2Item per result, then follow the 'next' link."""
        sel = Selector(response)
        next_page = sel.xpath("//div[@role='navigation']/a[@data-link-type='next']/@href").extract()
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): Description uses the same XPath as 'view' below --
            # probably meant to target a different element; confirm intent.
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            yield item
        if next_page:  # original used `if not not next_page` -- same truthiness
            yield Request("http://www.youtube.com" + next_page[0], self.parse)
---------------------------------------------------------------------------------------------------------------------------------
second method:
========================================================================
from scrapy.selector import Selector
from scrapy.spider import Spider
from project2.items import Project2Item
from scrapy.http.request import Request
class youtubespider(Spider):
    """Scrape 50 fixed YouTube search-result pages (live music concerts)."""
    name = "youtube2"
    # BUGFIX: allowed_domains must be a LIST of domains, not a bare string --
    # Scrapy's offsite middleware iterates it, and iterating a string yields
    # single characters.
    allowed_domains = ["www.youtube.com"]

    # Pre-build the start URLs at class-definition time
    # (note: page numbering here starts at 0, matching the original).
    start_urls = [
        'https://www.youtube.com/results?search_query=live+music+concert+india&page=%s' % page
        for page in range(50)
    ]

    def parse(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
=======================================================================
third method:
----------------------------------------------------------------------------------------------------------------------------------
from scrapy.selector import Selector
from project2.items import Project2Item
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class youtubespider(CrawlSpider):
    """Crawl paginated YouTube search results for 'bollywood movies'."""
    name = "bollywoodMovies"
    allowed_domains = ["www.youtube.com"]
    start_urls = ['https://www.youtube.com/results?search_query=bollywood+movies&page=1']
    # Follow every link whose URL contains "page=<digits>" and parse it.
    # FIX: raw string for the regex so `\d` is not an invalid escape.
    rules = (Rule(SgmlLinkExtractor(allow=(r"page=\d+",)), callback="parseitem", follow=True),)

    def parseitem(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[2]/span/@data-original-html").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
========================================================================
fourth style: Try it yourself
I hope this is helpful for you.
----------------------------------------------------------------------------------------------------------------------------------
I don't have much time to write a full Scrapy tutorial for you, but this example may be a helpful hand to Scrapy beginners.
This program recursively scrapes youtube.com; the links to scrape come from a database, and the results are stored back in the database.
here is
Items.py
---------------------------------------------------------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class Project2Item(Item):
    """Item holding the fields scraped for one YouTube search result."""
    # define the fields for your item here like:
    # name = Field()
    videoLink = Field()      # link to the video page (prefixed with the domain)
    videoDuration = Field()  # duration text, e.g. "3:45"
    imageVideo = Field()     # thumbnail image URL
    titleVideo = Field()     # video title
    Description = Field()    # description / metadata snippet
    view = Field()           # view-count text
---------------------------------------------------------------------------------------------------------------------
here is your pipelines.py
-------------------------------------------------------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import re
class Project2Pipeline(object):
    """Store each scraped item into the MySQL table ``scrapyTest``."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; in a real project move
        # them into settings.py and read them from the crawler settings.
        self.conn = MySQLdb.connect(host='localhost',
                                    user='root',
                                    passwd='froot',
                                    db='prime',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item; on a DB error report it, then pass the item on."""
        try:
            # Parameterized query: MySQLdb escapes the values for us.
            # videoLink is already a plain string; title/view are lists, so
            # the first element is taken.
            self.cursor.execute("""insert into scrapyTest (videoLink, videoTitle,videoView,flag)
VALUES (%s, %s, %s,1)""", (item['videoLink'], item['titleVideo'][0], item['view'][0]))
            self.conn.commit()
        except MySQLdb.Error as e:
            # `as e` / print(...) replace the Python-2-only forms so the
            # snippet also parses under Python 3.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
--------------------------------------------------------------------------------------------------------------------------------
here is your settings.py
======================================================================
# Scrapy settings for project2 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Scrapy project configuration for project2.
BOT_NAME = 'project2'

# Where Scrapy discovers spiders and where `genspider` puts new ones.
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'

# Be polite: at most one concurrent request per domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Route every scraped item through the MySQL pipeline (priority 300).
ITEM_PIPELINES = {'project2.pipelines.Project2Pipeline': 300}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'project2 (+http://www.yourdomain.com)'
-------------------------------------------------------------------------------------------------------------------------------
here is your different spiders.py files:
========================================================================
from scrapy.spider import Spider
from scrapy.selector import Selector
from project2.items import Project2Item
import urlparse
from scrapy.http.request import Request
import MySQLdb
class spider(Spider):
    """Scrape YouTube search-result pages whose URLs are read from MySQL."""
    name = "Youtube"
    allowed_domains = ["youtube.com"]

    # NOTE(review): this query runs at import time (class body), not when the
    # crawl starts -- the page links flagged 2 in scrapyTest become start URLs.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='froot', db='prime')
    cursor = conn.cursor()
    cursor.execute("select pageLink from scrapyTest where flag=2")
    row = [item[0] for item in cursor.fetchall()]
    print(row)
    start_urls = row  # ["https://www.youtube.com/results?search_query=live+music+INdia&page=1"]

    def parse(self, response):
        """Yield one Project2Item per result, then follow the 'next' link."""
        sel = Selector(response)
        next_page = sel.xpath("//div[@role='navigation']/a[@data-link-type='next']/@href").extract()
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): Description uses the same XPath as 'view' below --
            # probably meant to target a different element; confirm intent.
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            yield item
        if next_page:  # original used `if not not next_page` -- same truthiness
            yield Request("http://www.youtube.com" + next_page[0], self.parse)
---------------------------------------------------------------------------------------------------------------------------------
second method:
========================================================================
from scrapy.selector import Selector
from scrapy.spider import Spider
from project2.items import Project2Item
from scrapy.http.request import Request
class youtubespider(Spider):
    """Scrape 50 fixed YouTube search-result pages (live music concerts)."""
    name = "youtube2"
    # BUGFIX: allowed_domains must be a LIST of domains, not a bare string --
    # Scrapy's offsite middleware iterates it, and iterating a string yields
    # single characters.
    allowed_domains = ["www.youtube.com"]

    # Pre-build the start URLs at class-definition time
    # (note: page numbering here starts at 0, matching the original).
    start_urls = [
        'https://www.youtube.com/results?search_query=live+music+concert+india&page=%s' % page
        for page in range(50)
    ]

    def parse(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
=======================================================================
third method:
----------------------------------------------------------------------------------------------------------------------------------
from scrapy.selector import Selector
from project2.items import Project2Item
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class youtubespider(CrawlSpider):
    """Crawl paginated YouTube search results for 'bollywood movies'."""
    name = "bollywoodMovies"
    allowed_domains = ["www.youtube.com"]
    start_urls = ['https://www.youtube.com/results?search_query=bollywood+movies&page=1']
    # Follow every link whose URL contains "page=<digits>" and parse it.
    # FIX: raw string for the regex so `\d` is not an invalid escape.
    rules = (Rule(SgmlLinkExtractor(allow=(r"page=\d+",)), callback="parseitem", follow=True),)

    def parseitem(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[2]/span/@data-original-html").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
========================================================================
fourth style: Try it yourself
I hope this is helpful for you.
----------------------------------------------------------------------------------------------------------------------------------
Selenium Example for iphone and android
from selenium import webdriver

# Pick ONE of the drivers below for the platform you are testing.
# iPhone (remote WebDriver hub)
driver = webdriver.Remote(browser_name="iphone", command_executor='http://172.24.101.36:3001/hub')
# Android (remote WebDriver hub)
driver = webdriver.Remote(browser_name="android", command_executor='http://127.0.0.1:8080/hub')
# Google Chrome
driver = webdriver.Chrome()
# Firefox
driver = webdriver.Firefox()

# ------------------------------
# The actual test scenario: test the codepad.org code execution service.

# Go to codepad.org
driver.get('http://codepad.org')

# Select the Python language option
python_link = driver.find_elements_by_xpath("//input[@name='lang' and @value='Python']")[0]
python_link.click()

# Enter some text!
text_area = driver.find_element_by_id('textarea')
text_area.send_keys("print 'Hello,' + ' World!'")

# Submit the form!
submit_button = driver.find_element_by_name('submit')
submit_button.click()

# Make this an actual test. Isn't Python beautiful?
assert "Hello, World!" in driver.get_page_source()

# Close the browser!
driver.quit()
Subscribe to:
Posts (Atom)